#'PPI: Match PDB
#'
#'This function matches combined data to PDB files.
#'
#'@param xlink_df data.frame() as created by ppi.combineData()
#'@param fasta_file FASTA file as loaded by seqinr::read.fasta()
#'@param pdb_numbering If TRUE, will number the
#'@param pdb_directory pdb_directory
#'@param pdb_match_vector pdb_match_vector
#'@param csv_pdb_input_file csv_pdb_input_file
#'@param fasta_names_to_generate_all_2d_structures fasta_names_to_generate_all_2d_structures
#'@param file_type_2d file_type_2d
#'@param protein_alternative_names_dict protein_alternative_names_dict
#'@export
ppi.matchPDB <- function(xlink_df, fasta_file, pdb_numbering = FALSE,
pdb_directory = NULL, pdb_match_vector = NULL,
csv_pdb_input_file = NULL,
fasta_names_to_generate_all_2d_structures = NULL,
protein_alternative_names_dict = NULL,
download_pdb = TRUE,
file_type_2d = 'single_dot'){
#dist_xlink_list <- c()
#pdb1_xlink_list <- c()
#pdb2_xlink_list <- c()
if(!is.null(pdb_directory)){
experiment_directory <- pdb_directory
} else {
experiment_directory <- getwd()
}
xlink_list <- list()
list_of_protein_names <- names(fasta_file)
if(!is.null(protein_alternative_names_dict)){
protein_alternative_names_dict <- read.csv(protein_alternative_names_dict)
for(pand_name in names(protein_alternative_names_dict)){
#add entire column to list
pand_list <- as.character(protein_alternative_names_dict[,pand_name])
list_of_protein_names <- c(list_of_protein_names,pand_list)
}
#add to list of protein names
}
list_of_protein_names <- unique(list_of_protein_names)
current_directory <- getwd()
#Sys.setenv(TZ='Australia/Melbourne')
#create a system log file that has sys time on it instead?
#what about the %03d?
#new_directory_title <- paste('make_diff_analysis Run on',Sys.time())
#new_directory_title <- gsub(':','-',new_directory_title)
#new_directory <- paste(current_directory,'/',new_directory_title,sep = '')
#dir.create(new_directory)
pdb_suffix <- file_type_2d
# if(file_type_2d == 'single dot'){
# pdb_suffix <- 'single_dot'
# } else {
# pdb_suffix <- file_type_2d
# }
if(is.null(csv_pdb_input_file)){
if(pdb_numbering == TRUE){
if(is.null(pdb_match_vector)){
pdb_match_vector <- ppi.alignPDB(fasta_file = fasta_file)
}
} else {
#pdb_numbering == FALSE --> do what has been done before
generated_pdb_lists <- generate_pdb_lists_and_files_from_fasta(fasta_file,pdb_suffix,
fasta_names_to_generate_all_2d_structures)
#output this as a csv file?
#if option is selected yes --> can output as csv file
make_start_end_pdb_df_output2(generated_pdb_lists = generated_pdb_lists)
write.csv(make_start_end_pdb_df_output2(generated_pdb_lists = generated_pdb_lists),'pdb_start_stop.csv')
} #end else to if(pdb_numbering == TRUE){
} else { #if someone has indicated a PDB file to be used
generated_pdb_lists <- generate_pdb_lists_from_pdb_csv(csv_pdb_input_file)
#need same input as list of all pdb files
}
#return(generated_pdb_lists)
#if pdb_numbering == TRUE --> will need a substitute for these or otherwise ignore them
if(pdb_numbering == FALSE){
list_of_all_pdb_files <- generated_pdb_lists$all_pdb_files
list_of_start_and_end_pdbs <- generated_pdb_lists$start_end_chain
} else {
#end if(pdb_numbering == FALSE)
list_of_all_pdb_files <- pdb_match_vector
}
#xlink_df <- xlink.df
xlink_df$dist <- rep(NA,nrow(xlink_df))
xlink_df$pdb1 <- rep(NA,nrow(xlink_df))
xlink_df$pdb2 <- rep(NA,nrow(xlink_df))
stored_pdbs <- list()
#-----xlink_df cycle starts---------
for(row_num in 1:nrow(xlink_df)){
xlink_row <- xlink_df[row_num,]
otps_list <- c()
xyz_coord_list <- list()
for(pro_num in 1:2){
protein_name <- as.character(xlink_row[[paste0('pro_name',pro_num)]])
protein_pos <- as.character(xlink_row[[paste0('pro_pos',pro_num)]])
if(pdb_numbering == TRUE){
protein_pos_match <- match(protein_pos,pdb_match_vector[[protein_name]]$fasta)
pdb_protein_pos <- as.numeric(pdb_match_vector[[protein_name]]$pdb[protein_pos_match])
xlink_row[[paste0('pro_pos',pro_num)]] <- pdb_protein_pos
#xlink_df[[row_num,paste0('pro_pos',pro_num)]] <- pdb_protein_pos
#protein_split_list[[index_num]][2] <- pdb_protein_pos
#xlink_row
on_this_pdb_structure <- pdb_match_vector[[protein_name]]$chain[protein_pos_match]
xlink_df[row_num,paste0('pdb',pro_num)] <- on_this_pdb_structure
otps_list <- c(otps_list,on_this_pdb_structure)
} else {
on_this_pdb_structure <- get_pdb_structure_from_protein_pos(protein_name,
protein_pos,
list_of_all_pdb_files,
list_of_start_and_end_pdbs)
if(!is.null(on_this_pdb_structure)){
otps_list <- c(otps_list,on_this_pdb_structure)
xlink_df[row_num,paste0('pdb',pro_num)] <- on_this_pdb_structure
} #end if(!is.null(on_this_pdb_structure)){
} #end else to if(pdb_numbering == TRUE)
} #end for(pro_num in 1:2){
pdbs_are_same <- TRUE
if(pdb_numbering == TRUE){
pdbs_are_same <- (length(unique(unlist(strsplit(otps_list,'_'))[c(T,F)])) == 1)
}
if((length(otps_list) != 2) || ('-' %in% otps_list) || (pdbs_are_same == FALSE)){
cat('Skipping this crosslinking site, adding to list of XL sites not listed\n')
next
#cat('Skipping this crosslinking site, adding to list of XL sites not listed')
#add crosslinking site to list of not included crosslinking sites?
#list or dataframe?
#not_included_xl_sites <- c(not_included_xl_sites,seq_xlink2)
} #end if(length(otps_list) != 2)
#should change where these are?
#pdb1_xlink_list <- c(pdb1_xlink_list,otps_list[1])
#pdb2_xlink_list <- c(pdb2_xlink_list,otps_list[2])
#xlink_df[row_num,paste0('pdb',pro_num)] <- on_this_pdb_structure
# if(row_num == 5){
# return(otps_list)
# }
if((!(FALSE %in% grepl('___',otps_list))) || pdb_numbering == TRUE){ #make sure that it is all TRUEs to go forward with dist calculation
for(otps_index in 1:length(otps_list)){ #need to load the protein position as well
#otps_index <- 1
otps <- otps_list[otps_index]
if(pdb_numbering == TRUE){
#can check the temporary directory for the PDB file
#see if pdb_id.pdb exists within the tempdir
#if yes, can
otps_split <- strsplit(otps,'_')[[1]]
# if((row_num == 5) && (otps_index == 2)){
# return(otps_split)
# }
pdb_id <- otps_split[1]
chain <- otps_split[2]
if(is.na(pdb_id)){
break
}
#print(pdb_id)
if(paste0(pdb_id,'.pdb') %in% list.files(tempdir())){
pdb_read <- read.pdb2(paste0(tempdir(),'/',pdb_id,'.pdb'))
} else {
if(!(pdb_id %in% names(stored_pdbs))){
pdb_read <- read.pdb2(pdb_id)
stored_pdbs[[pdb_id]] <- pdb_read
} else {
pdb_read <- stored_pdbs[[pdb_id]]
}
} #end if(paste0(pdb_id,'.pdb') %in% list.files(tempdir()))
#once the pdb has been read --> get just the chain?
pdb_read$atom <- pdb_read$atom[pdb_read$atom$chain == chain,]
#use the read.pdb function to be able to
} else { #end if(pdb_numbering == TRUE)
if(!is.null(pdb_directory)){
#paste the directory to otps
if(!endsWith('/',pdb_directory)){
pdb_directory <- paste(pdb_directory,'/',sep='')
}
pdb_path <- paste(pdb_directory,otps,sep='')
pdb_read <- read.pdb2(pdb_path)
} else {
pdb_read <- read.pdb2(otps)
}
} #end else to if(pdb_numbering == TRUE)
# if((row_num == 5) && (otps_index == 2)){
# return(pdb_read)
# }
#xyz_coords <- get_xyz_coordinates_pdb(pdb_read)
protein_name <- as.character(xlink_row[[paste0('pro_name',otps_index)]])
protein_pos <- as.character(xlink_row[[paste0('pro_pos',otps_index)]])
pdb_read_chain <- pdb_read$atom[pdb_read$atom$resno == protein_pos,]
xyz_matches <- as.numeric(pdb_read_chain[pdb_read_chain$elety == 'CA',][c('x','y','z')])
xyz_coord_list[[otps_index]] <- xyz_matches
} #end of otps_index in 1:length(opts_list) for loop
#calculate distance here
#double-check formula before using in the final version of distance
# if(row_num == 5){
# return(xyz_coord_list)
# }
if(length(xyz_coord_list) == 2){
dist_xyz <- dist.xyz(xyz_coord_list[[1]],xyz_coord_list[[2]])[1,1]
#dist_xlink_list <- c(dist_xlink_list,dist_xyz)
xlink_df[row_num,'dist'] <- dist_xyz
} else {
#dist_xlink_list <- c(dist_xlink_list,NA)
}
} else { #one or more PDB structures is not "real" so distance should not be calculated
#dist_xlink_list <- c(dist_xlink_list,NA)
}
} #end for(row_num in 1:nrow(xlink_df))
# if(length(dist_xlink_list) != nrow(xlink_df)){
# warning('Number of distances does not match original data.frame length\nReturning NULL')
# return(NULL)
# }
#xlink_df$dist <- dist_xlink_list
#xlink_df$pdb1 <- pdb1_xlink_list
#xlink_df$pdb2 <- pdb2_xlink_list
return(xlink_df)
} #end ppi.matchPDB function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.