#' @title <TDm6A_application>
#'
#' @param faFile_Input full path of the input fasta file storing the ID and sequence information
#' @param plot_Output full path of the output .png file to show the predicted m6A sites on the given sequence
#' @param csvFile_Output full path of the output .csv file to store the predicted results
#' @param cellType a string to be "A549", or "CD8T", or "HEK293"
#' @return NULL
#' @description This function is used to apply TDm6A models to predict m6A sites on a given nucleotide sequence
#' @export
TDm6A_application <- function(faFile_Input,plot_Output,csvFile_Output, cellType){
### refine the dataset:
fa = read.fasta(faFile_Input, as.string = TRUE, forceDNAtolower = FALSE,whole.header = TRUE)
fa = as.data.frame(unlist(fa))
fa$ID = rownames(fa)
colnames(fa) = c("Sequence","ID")
fa$Sequence = as.character(fa$Sequence)
fa$Sequence = toupper(fa$Sequence)
fa$Sequence = sub("\n","",fa$Sequence)
#### find the all the DRACH sites in lncRNAs:
print("Running step 1: Finding all adenosine sites conforming to the DRACH motif...\n")
DRACH_all = gregexpr("[AGT][AG]AC[ACT]", fa$Sequence)
### create dataframe to save each DRACH site:
myData = as.data.frame(NA)
colnames(myData) = "DRACH_site"
### put into different rows:
DRACH_all = unlist(DRACH_all)
i = 1; # df row
j = 1; # DRACH site row
while (j <= length(DRACH_all)) {
myData[i,1] = DRACH_all[j]
i = i + 1;
j = j + 1;
}
#### find the Adenosine site
myData$A_site = NA;
myData$A_site = myData$DRACH_site +2;
#### extract the 500t*2 +1 sequences
print("Running step 2: Input feature preparation...\n")
myData$flanking_500nt = NA;
i = 1;
while ( i <= nrow(myData))
{
myData$flanking_500nt[i] = substr(fa$Sequence, myData$A_site[i]-500, myData$A_site[i] + 500);
if (myData$A_site[i] <= 500)
{
ga = rep("X", 501-myData$A_site[i]);
gap = paste(ga, collapse = "");
myData$flanking_500nt[i] = paste(gap, myData$flanking_500nt[i], sep = "");
}
if (nchar(fa$Sequence) - myData$A_site[i] < 500)
{
ga = rep("X", 500 + myData$A_site[i] - nchar(fa$Sequence));
gap = paste(ga, collapse = "");
myData$flanking_500nt[i] = paste(myData$flanking_500nt[i], gap, sep = "");
}
i = i + 1;
}
if(mean(nchar(myData$flanking_500nt)) == 1001)
{
print("1001nt sequences checked.\n")
}else{
print("Error for extracting flanking sequences.\n")
stop("Program stopped.\n")
}
##### one-hot encoding:
myData$encoded_seq = NA
i = 1;
while ( i <= nrow(myData))
{
code_seq_A = gsub("A", "1000", myData$flanking_500nt[i]);
code_seq_C = gsub("C", "0100", code_seq_A);
code_seq_G = gsub("G", "0010", code_seq_C);
code_seq_T = gsub("[TU]", "0001", code_seq_G);
code_seq_X = gsub("X", "0000", code_seq_T);
myData$encoded_seq[i] = as.character(code_seq_X);
i = i + 1;
}
if(mean(nchar(myData$encoded_seq)) == 4004)
{
print("one-hot-encoding checked.\n")
}else{
print("Error for one-hot-encoding.\n")
stop("Program stopped.\n")
}
# convert to 3D array x_train:
x_array = array(data = NA, dim = c(4,1001, nrow(myData)));
print(dim(x_array));
s = 1;
r = 1;
c = 1;
j = 1;
while (s <= nrow(myData) )
{
c = 1;
while ( c <= 1001)
{
r = 1;
while (r <= 4)
{
x_array[r,c,s] = as.integer(substr(myData$encoded_seq[s],j,j))
r = r + 1;
j = j + 1;
}
c = c + 1;
}
s = s+1;
j = 1;
}
if(any(is.na(x_array)))
{
print("Error for matrix preparation.\n")
stop("Program stopped.\n")
}else{
print("Input data checked.\n")
}
### change the index order to the ones keras package can use:
x_test = aperm(x_array, c(3,2,1));
print(dim(x_test));
############################################################################## model prediction ###################################################
print("TDm6A predicting...\n")
## load models and predict:
modelName <- paste("TDm6A_",cellType,"_1.hdf5", sep = "")
path <- system.file(modelName, package ="TDm6A")
model <- load_model_hdf5(filepath = path)
classes_1 = model %>% predict_proba(x_test);
modelName <- paste("TDm6A_",cellType,"_2.hdf5", sep = "")
path <- system.file(modelName, package ="TDm6A")
model <- load_model_hdf5(filepath = path)
classes_2 = model %>% predict_proba(x_test);
modelName <- paste("TDm6A_",cellType,"_3.hdf5", sep = "")
path <- system.file(modelName, package ="TDm6A")
model <- load_model_hdf5(filepath = path)
classes_3 = model %>% predict_proba(x_test);
modelName <- paste("TDm6A_",cellType,"_4.hdf5", sep = "")
path <- system.file(modelName, package ="TDm6A")
model <- load_model_hdf5(filepath = path)
classes_4 = model %>% predict_proba(x_test);
modelName <- paste("TDm6A_",cellType,"_5.hdf5", sep = "")
path <- system.file(modelName, package ="TDm6A")
model <- load_model_hdf5(filepath = path)
classes_5 = model %>% predict_proba(x_test);
modelName <- paste("TDm6A_",cellType,"_6.hdf5", sep = "")
path <- system.file(modelName, package ="TDm6A")
model <- load_model_hdf5(filepath = path)
classes_6 = model %>% predict_proba(x_test);
modelName <- paste("TDm6A_",cellType,"_7.hdf5", sep = "")
path <- system.file(modelName, package ="TDm6A")
model <- load_model_hdf5(filepath = path)
classes_7 = model %>% predict_proba(x_test);
modelName <- paste("TDm6A_",cellType,"_8.hdf5", sep = "")
path <- system.file(modelName, package ="TDm6A")
model <- load_model_hdf5(filepath = path)
classes_8 = model %>% predict_proba(x_test);
modelName <- paste("TDm6A_",cellType,"_9.hdf5", sep = "")
path <- system.file(modelName, package ="TDm6A")
model <- load_model_hdf5(filepath = path)
classes_9 = model %>% predict_proba(x_test);
modelName <- paste("TDm6A_",cellType,"_10.hdf5", sep = "")
path <- system.file(modelName, package ="TDm6A")
model <- load_model_hdf5(filepath = path)
classes_10 = model %>% predict_proba(x_test);
classes = (classes_1 + classes_2 + classes_3 + classes_4 + classes_5 + classes_6 + classes_7 + classes_8 + classes_9 + classes_10)/10
print("TDm6A prediction finished.\n")
### read the position of lncRNA DRACH sites:
classes = as.data.frame(classes)
colnames(classes) = c("Pred_Neg_Prob","Pred_Posi_Prob")
myData = cbind.data.frame(myData,classes)
############################################### Plot:
print("Plotting...\n")
#### label the TP
myData$predict_class = "P"
i = 1;
while ( i <= nrow(myData))
{
if ( myData$Pred_Posi_Prob[i]<0.5)
{
myData$predict_class[i] = "N"
}
i = i + 1
}
colour <- c("black","red")
col.list <- rep(0,length(myData$predict_class))
col.list[myData$predict_class=="N"] <-1
col.list[myData$predict_class=="P"]<-2
#### save plot
png(plot_Output,width = 1200,height = 800)
par(mar=c(5,5,5,5))
plot(myData$A_site,myData$Pred_Posi_Prob,
ylim = c(0,1),xlim = c(1,nchar(fa$Sequence)),
col =c(colour[col.list]), lwd=1,
xlab = "Nucleotide position on sequence",
ylab="Probability to be m6A site",
main = fa$ID, type = "h",
cex=1.5,
cex.axis=1.5,
cex.lab=1.5)
legend("topright",legend = c("Positive m6A sites","Negative m6A sites"),
col=c("red","black"),
lty = 1:1,
cex = 1.5)
dev.off()
#### save predict result:
myData=myData[,c(2,5,6,7)]
write.csv(myData,csvFile_Output,row.names = FALSE)
print("\nDone\n")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.