#' Read the methylation file.
#'
#' @import dplyr
#' @import ffbase
#'
#' @description This function reads all of the methylation files and generates one file with all samples including methylated read coverages (Cs) and unmethylated read coverages (Ts).
#' It can automatically test how many samples and how many replicates in each group and the distribute them from 1_1, 1_2 to the final file by headers.
#' The methylation files should be the standard coverage file (i.e., .bismark.cov) outputted from Bismark software.
#' The dataset of the example is the Reduced representation bisulfite sequencing (RRBS) data of DNA methylation for mouse myeloid progenitor tissue from GEO (Accession number: GSE62392)
#' (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE62392).
#'
#' @param paths refers to the path of methylation file, with default the package path.
#' @param suffix refers to the suffix of methylation file, e.g., ".gz", ".zip" and so on (some files are in text .txt format, then ".txt" or ".txt.gz"), with default ".gz".
#' @param control_paths refers to the path of control groups, with default NULL.
#' @param case_paths refers to the path of case groups, with default NULL.
#' @param WGBS refers to whether to use Whole genome bisulfite sequencing (WGBS) data, with default FALSE.
#'
#' @return Outputs a data frame contain chromosome, position, and Cs & Ts for different replicates and groups.
#'
#' @examples
#' # test the functions with default parameters #
#' inputmethfile <- Methfile_read()
#' inputmethfile <- Methfile_read(paths = paste(system.file(package = "GeneDMRs"), "/methdata", sep=""), suffix = ".gz")
#' inputmethfile <- Methfile_read(paths = "C:/Users/GeneDMRs/methdata/", suffix = ".txt.gz", WGBS = TRUE)
#'
#' # if only case and control group (n = 2) paths are provided #
#' controls <- c("C:/Users/GeneDMRs/methdata/1_1.gz", "C:/Users/GeneDMRs/methdata/1_2.gz", "C:/Users/GeneDMRs/methdata/1_3.gz")
#' cases <- c("C:/Users/GeneDMRs/methdata/2_1.gz", "C:/Users/GeneDMRs/methdata/2_1.gz")
#' inputmethfile <- Methfile_read(control_paths = controls, case_paths = cases)
#'
#' @export
# So, what are the origanal coverage file and combined methall file looking like? e.g. data from mouse #
# Coverage file from Bismark without header, so don't add the header. If the cov file is with header, please use "chr", "posi", "Cs", "Ts" for four columns #
# It contains six columns that are choromsome, start position, end position, methylated percentage, methylated read coverage and unmethylated read coverage #
# Methylated percentage = Methylated read coverage / (Methylated read coverage and Unmethylated read coverage) #
#1 chr1 3020877 3020877 97.46835 77 2
#2 chr1 3020891 3020891 92.40506 73 6
#3 chr1 3020946 3020946 88.67925 47 6
#4 chr1 3020988 3020988 98.64865 73 1
#5 chr1 3021013 3021013 100.00000 74 0
#6 chr1 3094122 3094122 0.00000 0 1
#7 chr1 3094126 3094126 100.00000 1 0
#8 chr1 3150008 3150008 100.00000 3 0
#9 chr1 3150022 3150022 100.00000 3 0
#10 chr1 3150068 3150068 100.00000 1 0
# The output file of this function is with header and combines all the files with the same chromosome and position #
# It contains choromsome, position, methylated read coverages (Cs) and unmethylated read coverages (Ts) of each sample #
# chr posi Cs1_1 Ts1_1 Cs1_2 Ts1_2 Cs1_3 Ts1_3 Cs2_1 Ts2_1 Cs2_2 Ts2_2
#1 chr1 3020877 77 2 77 7 49 2 31 4 68 0
#2 chr1 3020891 73 6 78 6 49 2 33 2 68 0
#3 chr1 3020946 47 6 96 17 71 9 52 5 71 12
#4 chr1 3020988 73 1 58 0 57 6 55 2 61 2
#5 chr1 3021013 74 0 56 2 59 4 49 8 63 0
#6 chr1 3531651 11 1 25 0 13 1 7 0 15 1
#7 chr1 3531658 12 0 25 0 12 2 7 0 16 0
#8 chr1 3531671 12 0 25 0 13 1 6 1 16 0
#9 chr1 3531676 12 0 25 0 14 0 7 0 16 0
#10 chr1 3531680 12 0 22 3 10 3 7 0 14 1
# The function for automatically reading different replicates of different groups #
Methfile_read <- function(paths = paste(system.file(package = "GeneDMRs"), "/methdata", sep=""),
control_paths = NULL, case_paths = NULL,
suffix = ".gz",
WGBS = FALSE){
# Output the begin time
print(paste("The start time is ", date(), sep = ""))
# if case and control group paths are provided #
if(is.null(control_paths) == F & is.null(case_paths) == F){
# from two group to different relicate files #
realreplicatenum_control <- length(control_paths)
realreplicatenum_case <- length(case_paths)
# count the number of file #
filenum <- 0
# control group #
for(j in 1:realreplicatenum_control){
# read cov file (without header) from Bismark output and save the column of read number of Ts and Cs #
if(WGBS == TRUE){
# if use WGBS data, read cov file based on ffbase package #
tmpfile <- read.table.ffdf(x = NULL, file = control_paths[j], FUN = "read.table",
nrows = -1, header = F)
}else{
tmpfile <- read.table(file = control_paths[j], header = F)
}
# check column number of the input file
if(ncol(tmpfile) == 7){
tmpfile <- tmpfile[, -7]
}
else if(ncol(tmpfile) > 7){
stop("Stop and please check the input file format")
}
# If the cov file is with header, use "chr", "posi", "Cs", "Ts" for four columns #
if(tmpfile[1,1] == "chr" | tmpfile[1,2] == "posi"){
if(WGBS == TRUE){
tmpfile <- read.table.ffdf(x = NULL, file = control_paths[j], FUN = "read.table",
nrows = -1, header = T)
}else{
tmpfile <- read.table(file = control_paths[j], header = F)
}
# arrange using the header #
tmpfile <- tmpfile[, c(tmpfile$chr, tmpfile$posi, tmpfile$Cs, tmpfile$Ts)]
}else{
tmpfile <- tmpfile[, -c(3,4)]
}
# rename the header by replicate number of group number #
colnames(tmpfile) <- c("chr","posi",paste("Cs",1,"_",j,sep = ""),paste("Ts",1,"_",j,sep = ""))
filenum <- filenum + 1
# combine each replicate file to the total dataset #
ifelse(filenum==1, methallfile_control <- tmpfile,
methallfile_control <- inner_join(methallfile_control, tmpfile, by = c('chr'='chr','posi'='posi')))
}
# count the number of file #
filenumcontrol <- filenum
filenum <- 0
# case group #
for(j in 1:realreplicatenum_case){
# read cov file (without header) from Bismark output and save the column of read number of Ts and Cs #
if(WGBS == TRUE){
# if use WGBS data, read cov file based on ffbase package #
tmpfile <- read.table.ffdf(x = NULL, file = case_paths[j], FUN = "read.table",
nrows = -1, header = F)
}else{
tmpfile <- read.table(file = case_paths[j], header = F)
}
# check column number of the input file
if(ncol(tmpfile) == 7){
tmpfile <- tmpfile[, -7]
}
else if(ncol(tmpfile) > 7){
stop("Stop and please check the input file format")
}
# If the cov file is with header, use "chr", "posi", "Cs", "Ts" for four columns #
if(tmpfile[1,1] == "chr" | tmpfile[1,2] == "posi"){
if(WGBS == TRUE){
tmpfile <- read.table.ffdf(x = NULL, file = case_paths[j], FUN = "read.table",
nrows = -1, header = T)
}else{
tmpfile <- read.table(file = case_paths[j], header = F)
}
# arrange using the header #
tmpfile <- tmpfile[, c(tmpfile$chr, tmpfile$posi, tmpfile$Cs, tmpfile$Ts)]
}else{
tmpfile <- tmpfile[, -c(3,4)]
}
# rename the header by replicate number of group number #
colnames(tmpfile) <- c("chr","posi",paste("Cs",2,"_",j,sep = ""),paste("Ts",2,"_",j,sep = ""))
filenum <- filenum + 1
# combine each replicate file to the total dataset #
ifelse(filenum==1, methallfile_case <- tmpfile,
methallfile_case <- inner_join(methallfile_case, tmpfile, by = c('chr'='chr','posi'='posi')))
}
# merge methallfile_control and methallfile_case together #
methallfile <- inner_join(methallfile_control, methallfile_case, by = c('chr'='chr','posi'='posi'))
# total file number #
filenum <- filenum + filenumcontrol
}else{
# set the paths #
setwd(paths)
# count the number of file #
filenum <- 0
# detect the total group number #
groupnum <- length(grep("_1", dir(paths)))
# from different group to different relicate files #
for(i in 1:groupnum){
# dectect the real replicate number of this group #
# output the real replicate number #
realreplicatenum <- length(grep(paste(i,"_",sep = ""), dir(paths)))
# read the file from each real replicate of each group and combine the same chromosome and position #
for(j in 1:realreplicatenum){
# read cov file (without header) from Bismark output and save the column of read number of Ts and Cs #
if(WGBS == TRUE){
# if use WGBS data, read cov file based on ffbase package #
tmpfile <- read.table.ffdf(x = NULL, file = paste(i, "_", j, suffix, sep = ""), FUN = "read.table",
nrows = -1, header = F)
}else{
tmpfile <- read.table(file = paste(i, "_", j, suffix, sep = ""), header = F)
}
# check column number of the input file
if(ncol(tmpfile) == 7){
tmpfile <- tmpfile[, -7]
}
else if(ncol(tmpfile) > 7){
stop("Stop and please check the input file format")
}
# If the cov file is with header, use "chr", "posi", "Cs", "Ts" for four columns #
if(tmpfile[1,1] == "chr" | tmpfile[1,2] == "posi"){
if(WGBS == TRUE){
tmpfile <- read.table.ffdf(x = NULL, file = paste(i, "_", j, suffix, sep = ""), FUN = "read.table",
nrows = -1, header = T)
}else{
tmpfile <- read.table(file = paste(i, "_", j, suffix, sep = ""), header = F)
}
# arrange using the header #
tmpfile <- tmpfile[, c(tmpfile$chr, tmpfile$posi, tmpfile$Cs, tmpfile$Ts)]
}else{
tmpfile <- tmpfile[, -c(3,4)]
}
# rename the header by replicate number of group number #
colnames(tmpfile) <- c("chr","posi",paste("Cs",i,"_",j,sep = ""),paste("Ts",i,"_",j,sep = ""))
filenum <- filenum + 1
# combine each replicate file to the total dataset #
ifelse(filenum==1, methallfile <- tmpfile, methallfile <- inner_join(methallfile, tmpfile, by = c('chr'='chr','posi'='posi')))
}
}
}
# Sort by chromosome and position #
methallfile <- arrange(methallfile,chr,posi)
# Output how many files are read and the running timen #
print(paste("Total methylation file number is", filenum, "and the reading finish time is", date(), sep = " "))
return(methallfile)
}
# Finally output the methallfile as described before #
# If there are some Warning messages likes this: Column `chr` joining factors with different levels, coercing to character vector. #
# It doesn't matter because the tool dplyr tested that one of the columns was a factor and that factor had different levels in the different datasets. #
# In order not to lose any information, the factors were converted to character values. #
#' Read the standard bedfile of refseq or cpgi downloaded from UCSC.
#'
#' @import dplyr
#' @import genomation
#'
#' @description This function reads the bed file of refseq or cpgi and sorts them by chromosome and position.
#' The dataset of the example are the mouse reference genes and CpG island information that are downloaded from UCSC website
#' (http://genome.ucsc.edu/cgi-bin/hgTables).
#' The R package genomation used here can divide the refseq.bed file into several gene body features, e.g., promoter, exon, intron regions and
#' the cpgi.bed file into CpG island features, e.g., CpG island and CpG island shore.
#'
#' @param paths refers to the path of bed file, with default the package path.
#' @param bedfile refers to the file name of bed file like "refseq" or "cpgi". This file is downloaded from UCSC website, with default "refseq".
#' @param suffix refers to the suffix of bed file, e.g., ".gz", ".zip" and so on (some files are in text .txt format, then ".txt" or ".txt.gz"), with default ".txt".
#' @param feature refers to whether to read the bed with the features, with default FALSE.
#' If feature = TRUE, the output of this function will contain the features e.g., promoter, exon, intron or CpG island, CpG island shore based on R package genomation.
#'
#' @param featurewrite refers to whether to write out the feature file to the given path, with default FALSE.
#'
#' @return Outputs a data frame contains four columns of chromosome, start position, end position.
#' If feature = TRUE, the data frame is five columns with the added feature such as genebody or cpgfeature.
#'
#' @references Akalin A, Franke V, Vlahovicek K, Mason C, Schubeler D (2014). "genomation: a toolkit to summarize, annotate and visualize genomic intervals."
#' Bioinformatics. doi: 10.1093/bioinformatics/btu775, http://bioinformatics.oxfordjournals.org/content/early/2014/12/04/bioinformatics.btu775.long.
#'
#' @examples
#' inputrefseqfile <- Bedfile_read()
#' inputrefseqfile <- Bedfile_read(paths = paste(system.file(package = "GeneDMRs"), "/methdata", sep=""), bedfile = "refseq", suffix = ".txt", feature = FALSE)
#' inputcpgifile <- Bedfile_read(paths = paste(system.file(package = "GeneDMRs"), "/methdata", sep=""), bedfile = "cpgi", suffix = ".txt", feature = FALSE)
#' head(inputrefseqfile)
#' head(inputcpgifile)
#'
#' inputgenebodyfile <- Bedfile_read(bedfile = "refseq", feature = TRUE, featurewrite = TRUE)
#' inputcpgifeaturefile <- Bedfile_read(bedfile = "cpgi", feature = TRUE, featurewrite = FALSE)
#' head(inputgenebodyfile)
#' head(inputcpgifeaturefile)
#'
#' @export
# So, what are the origanally standard bedfile and outputed file looking like? e.g. data from pig #
# Bedfile is without header, so don't add the header #
# refseq.bed file #
# It contains 12 columns that are choromsome, chromstart (thickStart) position, chromend (thickEnd) position, #
# NCBI ID number for mRNA, score, strand, coding start position, coding end position, score, number of exon, length of exon, distance from TSS start position to exon #
#1 chr1 75465296 75508244 NM_001243664 0 - 75465385 75508212 0 8 201,136,57,148,63,189,96,155, 0,11425,20986,26474,26728,35769,39257,42793,
#2 chr1 5698507 6731132 NM_001044603 0 + 5698507 6729846 0 11 166,229,122,84,116,137,62,150,84,118,1399, 0,160362,224013,370124,439504,592645,812774,826595,994107,1025825,1031226,
#3 chr1 73253304 73510244 NM_001167653 0 - 73253304 73510244 0 8 159,33,132,174,75,199,135,278, 0,27538,48205,49420,82314,106754,149871,256662,
#4 chr1 77585913 77642342 NM_001080206 0 - 77585913 77642342 0 11 209,132,154,77,180,165,150,104,99,97,247, 0,10778,30879,31123,32747,36536,39881,41003,44299,52245,56182,
#5 chr1 85758623 86017579 NM_001123219 0 - 85758623 86017579 0 10 141,87,111,98,109,156,134,69,78,208, 0,71926,167857,177254,203119,205053,205423,243559,246232,258748,
#6 chr1 107997304 108131723 NM_001243563 0 + 108003699 108131137 0 12 99,98,222,139,130,49,27,153,46,90,84,667, 0,6374,73642,82787,101201,107787,108541,117080,118263,130344,130541,133752,
#7 chr1 122633627 122688609 NM_001206341 0 + 122633907 122685089 0 18 304,179,325,136,230,25,116,135,81,168,142,170,133,163,221,155,220,4200, 0,8356,10502,13455,17533,18050,18724,23354,25146,25780,26091,29689,30624,32730,34495,36698,43102,50782,
#8 chr1 128960122 129013424 NM_214171 0 - 128960728 129013201 0 24 633,59,117,79,69,65,58,78,114,18,37,209,12,170,161,78,86,84,147,169,134,119,70,532, 0,1029,1338,1658,1825,2366,2536,2948,4018,6625,8220,8935,9780,10414,11725,12641,14652,15615,17664,18622,19485,21066,23205,52770,
#9 chr1 188704259 188925855 NM_001244640 0 - 188705091 188925855 0 9 875,59,47,70,139,208,732,774,241, 0,5651,5795,6353,7825,10764,108217,118752,221355,
#10 chr1 189690481 189907808 NM_001166316 0 + 189690481 189907808 0 8 89,153,74,104,141,126,122,121, 0,55712,57518,67005,73033,78455,118475,217206,
# cpgi.bed file #
# It contains 4 columns that are choromsome, start position, end position, CpG ID #
#1 chr1 21811 22330 CpG:_48
#2 chr1 23707 24083 CpG:_41
#3 chr1 66380 66649 CpG:_22
#4 chr1 91255 91533 CpG:_19
#5 chr1 122129 122347 CpG:_20
#6 chr1 139652 140059 CpG:_32
#7 chr1 160025 160481 CpG:_33
#8 chr1 160781 160986 CpG:_20
#9 chr1 174275 174652 CpG:_36
#10 chr1 186372 187350 CpG:_119
# The output file of this function is with header and combines with the other features #
# The inputrefseqfile contains NCBI ID of gene, chromosome, start position, end position.
# refseq chr start end
# 1 NM_001244353 chr1 23826 40033
# 2 NR_128500 chr1 1013706 1013802
# 3 NR_128506 chr1 1340182 1340263
# 4 NR_128509 chr1 1665567 1665643
# 5 NM_001244864 chr1 2541536 2552860
# 6 NM_001244534 chr1 2576814 2593516
# 7 NR_128518 chr1 2893741 2893818
# 8 NM_001007195 chr1 4738530 4883241
# 9 NM_001044603 chr1 5698507 6731132
# 10 NM_001143697 chr1 6807761 6883255
# The inputcpgifile contains CpG ID, chromosome, start position, end position.
# cpgi chr start end
# 1 CpG:_48 chr1 21811 22330
# 2 CpG:_41 chr1 23707 24083
# 3 CpG:_22 chr1 66380 66649
# 4 CpG:_19 chr1 91255 91533
# 5 CpG:_20 chr1 122129 122347
# 6 CpG:_32 chr1 139652 140059
# 7 CpG:_33 chr1 160025 160481
# 8 CpG:_20 chr1 160781 160986
# 9 CpG:_36 chr1 174275 174652
# 10 CpG:_119 chr1 186372 187350
# The inputgenebodyfile contains NCBI ID of gene, chromosome, start position, end position and gene feature.
# refseq chr start end genebody
# 1 NM_001244353 chr1 22826 24826 promoters
# 2 NM_001244353 chr1 23826 23826 TSSes
# 3 NM_001244353 chr1 23827 23965 exons_1
# 4 NM_001244353 chr1 23966 27359 introns_1
# 5 NM_001244353 chr1 27360 27467 exons_2
# 6 NM_001244353 chr1 27468 30767 introns_2
# 7 NM_001244353 chr1 30768 30849 exons_3
# 8 NM_001244353 chr1 30850 33299 introns_3
# 9 NM_001244353 chr1 33300 33429 exons_4
# 10 NM_001244353 chr1 33430 38650 introns_4
# The inputcpgifeaturefile contains chromosome, start position, end position and CpGfeature.
# cpgi chr start end cpgfeature
# 1 shore1 chr1 19811 21810 Shores_1
# 2 cpgi1 chr1 21811 22330 CpGisland_1
# 3 shore2 chr1 22331 23706 Shores_2
# 4 cpgi2 chr1 23707 24083 CpGisland_2
# 5 shore3 chr1 24084 26083 Shores_3
# 6 shore4 chr1 64380 66379 Shores_4
# 7 cpgi3 chr1 66380 66649 CpGisland_3
# 8 shore5 chr1 66650 68649 Shores_5
# 9 shore6 chr1 89255 91254 Shores_6
# 10 cpgi4 chr1 91255 91533 CpGisland_4
Bedfile_read <- function(paths = paste(system.file(package = "GeneDMRs"), "/methdata", sep=""), bedfile = "refseq",
suffix = ".txt", feature = FALSE, featurewrite = TRUE){
# set the paths #
setwd(paths)
# read refseq or cpgi file #
if(bedfile=="refseq"){
if(feature == FALSE){
regionfile <- Regionfile_read(bedfile, suffix)
return(regionfile)
}else{
# region file is the reference gene or cpg bed file (without header) #
regionfile <- Regionfile_read(bedfile, suffix)
geneobj <- readTranscriptFeatures(paste(bedfile, ".bed", suffix, sep = ""))
# Output the defined gene features based on refseq.bed file#
write.table(geneobj, "Genebody", col.names = F, row.names = F, quote = F)
geneobj <- read.table("Genebody")
# check the geneobj file for promoter regions #
geneobj <- refseqfile_check(geneobj, regionfile)
# if featurewrite == FALSE then delete geneobj file #
if(featurewrite == FALSE){
file.remove("Genebody")
}
return(geneobj)
}
}else if(bedfile=="cpgi"){
if(feature == FALSE){
regionfile <- Regionfile_read(bedfile, suffix)
return(regionfile)
}else{
# region file is the reference gene or cpg bed file (without header) #
cpgiobj <- readFeatureFlank(paste(bedfile, ".bed", suffix, sep = ""), feature.flank.name = c("CpGisland", "Shores"))
# Output the defined gene features based on refseq.bed file #
write.table(cpgiobj, "Cpgifeature", col.names = F, row.names = F, quote = F)
cpgiobj <- read.table("Cpgifeature")
cpgiobj[,6] <- cpgiobj[,2]
cpgiobj <- cpgiobj[,3:6]
colnames(cpgiobj) <- c("chr", "start", "end", "cpgfeature")
# rename the "cpgi" #
cpginum <- sum(cpgiobj$cpgfeature == "CpGisland")
cpginame <- c(paste("cpgi", 1:cpginum, sep = ""), paste("shore", 1:(nrow(cpgiobj) - cpginum), sep = ""))
cpgiobj <- data.frame(cpgi = cpginame, cpgiobj)
# sort the file #
cpgiobj <- arrange(cpgiobj, chr, start)
# if featurewrite == FALSE then delete cpgiobj file #
if(featurewrite == FALSE){
file.remove("Cpgifeature")
}
return(cpgiobj)
}
}else{
# inform only refseq or cpgi file can be read #
stop("Wrong bed file names and please use refseq or cpgi name")
}
}
#' Internal Use Function
#' That reads the bed file.
#'
#' @description This function reads the bed file and sort them.
#'
#' @param bedfile refers to the bedfile name (e.g., refseq or cpgi) downloaded from UCSC, with the default refseq.
#' @param suffix refers to the compressed file suffix such as ".gz", ".zip" and so on, with the default suffix ".gz".
#'
#' @return Outputs bed file that can be used in the next step.
#'
#' @examples
#' regionfile <- Regionfile_read(bedfile, suffix)
#'
#' @export
Regionfile_read <- function(bedfile, suffix){
# region file is the reference gene or cpg bed file (without header)
regionfile <- read.table(paste(bedfile, ".bed", suffix, sep = ""), header=F)
regionfile <- cbind(regionfile[, c(4,1,2,3)], 0)
colnames(regionfile) <- c(bedfile, "chr", "start", "end", "uniid")
# unique the repeated positions #
regionfile$uniid <- paste(regionfile$chr, regionfile$start, regionfile$end, sep = "_")
regionfile <- distinct(regionfile, uniid, .keep_all = TRUE)
#sort by chromosome and position
regionfile <- arrange(regionfile, chr, start)
return(regionfile[,-5])
}
#' Internal Use Function
#' That checks and annotates the promoter to the specific gene.
#'
#' @description This function sorts and makes sure geneobj file can be used in this package,
#' that mainly works on the promoters without the choromsome and position information.
#' In addition, it can add the genomic number of exons and introns, e.g., first exon or second intron.
#'
#' @param geneobj refers to the genebody file from readTranscriptFeatures of package genomation.
#' @param regionfile refers to the reference gene or cpg bed file (without header)
#'
#' @return Outputs gene body file.
#'
#' @examples
#' geneobj <- refseqfile_check(geneobj, regionfile)
#'
#' @export
refseqfile_check <- function(geneobj, regionfile){
geneobj <- geneobj[,c(9,3,4,5,2,7)]
colnames(geneobj) <- c("refseq","chr","start","end","genebody","strand")
## promoters without the choromsome and position information ##
tmp_promoter <- filter(geneobj, genebody == "promoters")
tmp_nopromoter <- filter(geneobj, genebody != "promoters")
print(paste("The total reading line for promoter is", nrow(tmp_promoter), sep = " "))
for(i in 1:nrow(tmp_promoter)){
# fill up the missing information of promoters #
if(as.vector(unlist(tmp_promoter$strand))[i] == "+"){
# when the strand is "+", then start position of gene is in the middle of promoter #
tmp <- filter(regionfile, chr == as.vector(unlist(tmp_promoter$chr))[i], start == (tmp_promoter[i,3] + 1000))
}else{
# when the strand is "-", then end position of gene is in the middle of promoter #
tmp <- filter(regionfile, chr == as.vector(unlist(tmp_promoter$chr))[i], end == (tmp_promoter[i,3] + 1000))
}
# replace '.' by NCBI ID #
# if the chromosome and position are same but with two NCBI IDs, juse use the first one #
tmp_promoter[i,1] <- as.vector(unlist(tmp$refseq))[1]
}
geneobj <- rbind(tmp_promoter, tmp_nopromoter)
# unique the repeated positions #
geneobj$strand <- paste(geneobj$chr, geneobj$start, geneobj$end, sep = "_")
geneobj <- distinct(geneobj, strand, .keep_all = TRUE)
## add the number of exon and intron ##
tmp_exonintron <- filter(geneobj, genebody == "exons" | genebody == "introns")
tmp_noexonintron <- filter(geneobj, genebody != "exons" & genebody != "introns")
# sort the file #
tmp_exonintron <- arrange(tmp_exonintron, chr, refseq, start, genebody)
# add one first row and one last row #
tmp_exonintron <- rbind(tmp_noexonintron[1, ], tmp_exonintron, tmp_noexonintron[1, ])
numexon <- 1
numintron <- 1
for(j in 2:(nrow(tmp_exonintron) - 1)){
# for same gene #
if(as.vector(unlist(tmp_exonintron$refseq))[j] == as.vector(unlist(tmp_exonintron$refseq))[j + 1]){
if(as.vector(unlist(tmp_exonintron$genebody))[j] == "exons"){
tmp_exonintron[j, ncol(tmp_exonintron)] <- paste("exons", numexon, sep = "_")
numexon <- numexon + 1
}else if(as.vector(unlist(tmp_exonintron$genebody))[j] == "introns"){
tmp_exonintron[j, ncol(tmp_exonintron)] <- paste("introns", numintron, sep = "_")
numintron <- numintron + 1
}
}else{
# last gene body of the same gene #
if(as.vector(unlist(tmp_exonintron$refseq))[j] == as.vector(unlist(tmp_exonintron$refseq))[j - 1]){
if(as.vector(unlist(tmp_exonintron$genebody))[j] == "exons"){
tmp_exonintron[j, ncol(tmp_exonintron)] <- paste("exons", numexon, sep = "_")
}else if(as.vector(unlist(tmp_exonintron$genebody))[j] == "introns"){
tmp_exonintron[j, ncol(tmp_exonintron)] <- paste("introns", numintron, sep = "_")
}
numexon <- 1
numintron <- 1
}else{
# for the different gene #
numexon <- 1
numintron <- 1
if(as.vector(unlist(tmp_exonintron$genebody))[j] == "exons"){
tmp_exonintron[j, ncol(tmp_exonintron)] <- paste("exons", numexon, sep = "_")
}else if(as.vector(unlist(tmp_exonintron$genebody))[j] == "introns"){
tmp_exonintron[j, ncol(tmp_exonintron)] <- paste("introns", numintron, sep = "_")
}
}
}
}
# delete the first row, last row and last column #
tmp_exonintron$genebody <- tmp_exonintron[, ncol(tmp_exonintron)]
tmp_exonintron <- tmp_exonintron[-c(1, nrow(tmp_exonintron)), -ncol(tmp_exonintron)]
geneobj <- rbind(tmp_exonintron, tmp_noexonintron[, -ncol(tmp_noexonintron)])
# sort the file #
geneobj <- arrange(geneobj, chr, start, refseq)
return(geneobj)
}
#' Read the cyto file.
#'
#' @description This function reads the chromosome information from cyto file (cytoBandIdeo.txt) and sort them by chromosome and position.
#' The dataset of the example is the mouse genome information downloaded from UCSC website
#' (http://hgdownload.cse.ucsc.edu/goldenPath/mm10/database/cytoBandIdeo.txt.gz).
#'
#' @param paths refers to the path of input file, with default the package path.
#' @param cytofile refers to the name of input cyto file that is downloaded from UCSC website, with default "cytoBandIdeo".
#' @param suffix refers to the suffix of input cyto file, e.g., ".gz", ".zip" and so on (some files are in text .txt format, then ".txt" or ".txt.gz"), with default ".txt.gz".
#'
#' @return Outputs a data frame contains chromosome, start position, end position.
#'
#' @examples
#' inputcytofile <- Cytofile_read()
#' inputcytofile <- Cytofile_read(paths = paste(system.file(package = "GeneDMRs"), "/methdata", sep=""),
#' cytofile = "cytoBandIdeo", suffix = ".txt.gz")
#'
#' @export
Cytofile_read <- function(paths = paste(system.file(package = "GeneDMRs"), "/methdata", sep=""), cytofile = "cytoBandIdeo",
suffix = ".txt.gz"){
# set the paths #
setwd(paths)
#chromosome info#
cyto <- read.table(paste(cytofile, suffix, sep = ""), fill = T, header=F)
# five columns or four columns #
if(ncol(cyto)==4){
cyto <- cbind(cyto,"gneg")
cyto[, 4] <- "p"
}
colnames(cyto) <- c("chr","start","end","band","stain")
# find the unannotated chromosome rows and delete them #
unqualifiedrow1 <- grep("_", cyto$chr)
if(length(unqualifiedrow1) > 0){
cyto <- cyto[-unqualifiedrow1,]
}
# find the chromosome M and delete it #
unqualifiedrow2 <- grep("M", cyto$chr)
if(length(unqualifiedrow2) > 0){
cyto <- cyto[-unqualifiedrow2,]
}
# transfer chr column to character #
cyto$chr <- as.vector(unlist(cyto$chr))
# set the chromosome label to number as the default of qqman #
chromtable <- table(cyto$chr)
# set the chromosome label to number as the default of qqman #
for(i in 1:length(chromtable[chromtable != 0])){
cyto$chr[cyto$chr == paste("chr", i ,sep="")] <- i
}
cyto$chr[cyto$chr == "chrX"] <- length(chromtable[chromtable != 0]) - 1
cyto$chr[cyto$chr == "chrY"] <- length(chromtable[chromtable != 0])
# transfer chr column to numeric again for sort #
cyto$chr <- as.numeric(cyto$chr)
cyto <- cyto[order(cyto$chr), ]
# paste "chr" to chromosome number #
cyto$chr <- paste("chr", cyto$chr, sep = "")
cyto$chr[cyto$chr == paste("chr", (length(chromtable[chromtable != 0]) - 1), sep = "")] <- "chrX"
cyto$chr[cyto$chr == paste("chr", length(chromtable[chromtable != 0]), sep = "")] <- "chrY"
return(cyto)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.