R/FormatConverters.R

Defines functions FormatEnsembl CharList2DF GRangesToBed BedToGRanges

### INFO: Converts between genomic formats
### DATE: 31.03.2011
### AUTHOR: v+


# {1} LIBRARIES
#/{1} LIBRARIES


# {2} CODE
	# ------------------------------------------------------------------------------ #
	# {{ 1 }}
	## converts from bed file to GRanges object
	BedToGRanges = function(bed, values=FALSE, seqlen=NULL){
  
		library(stringr)
		library(GenomicRanges)
		bed = bed[order(bed[,1], bed[,2]),]
  
		if(sum(str_detect(names(bed), 'strand')) == 1){
			strand=as.character(bed[,str_detect(names(bed), 'strand')])
    
			if(!all(strand %in% c('+','-','*')))
				stop('unallowed strand character found')
		}else{
			strand='*'
		}
  
  
		ranges = GRanges(
					seqnames=as.character(bed[,1]), 
					ranges=IRanges(start=as.numeric(bed[,2]), end=as.numeric(bed[,3])),
					strand=strand,
					.id=1:nrow(bed))
 
		if(!is.null(seqlen)){
			gseqlev=GRanges(names(seqlen), IRanges(1, seqlen))
			ranges = subsetByOverlaps(ranges,gseqlev, type='within')
			seqlevels(ranges) = names(seqlen)
			seqlengths(ranges) = seqlen
		}
  
		if(values == TRUE && ncol(bed) > 3){
			col.ids = setdiff(names(bed), c( "seqnames", "ranges", "strand", "seqlengths", "start", "end", "width",  "element", "chr"))
			values(ranges) = bed[values(ranges)$'.id',col.ids]
		}
		values(ranges)$'.id' = NULL
  
		return(ranges)
	}
	
	
	# ------------------------------------------------------------------------------ #
	# {{ 2 }}
	## converts from granges to bed
	GRangesToBed = function(ranges, values=F){
	

		bed = data.frame(
						 chr=as.character(seqnames(ranges)), 
						 start=start(ranges),
						 end=end(ranges), 
						 strand=as.character(strand(ranges))
						 )
		
		if(values == T){
			print('This is going to be slooow...')
			v = as.list(values(ranges))
			v = lapply(v, as.character)
			v = do.call(cbind, v)
			bed = cbind(bed, v)
		}
		attr(bed, 'seqlengts') = seqlengths(ranges)
		return(bed)
	}

	
	
	# ------------------------------------------------------------------------------ #
	# converts a list of characters to a dataframe
	CharList2DF = function(cl, sep="\t"){
	
		l = strsplit(cl, sep)
		if(! length(unique(sapply(l, length))) == 1)
			stop("elements do not have the same length")
		data.frame(do.call(rbind, l))
	}
	
	
	# ------------------------------------------------------------------------------ #
	#formats the ensembl gene files
	FormatEnsembl = function(file, outpath=NULL, outname=NULL, add.chr=TRUE){
	
		header=c('Chromosome Name'	= 'chr',
				 'Gene Start (bp)'	= 'start',
				 'Gene End (bp)'	= 'end',
				 'Exon Chr Start (bp)' = 'ex.start',
				 'Exon Chr End (bp)' = 'ex.end',
				 'Constitutive Exon' = 'const.ex',
				 'Exon Rank in Transcript' = 'ex.rank',
				 'phase' = 'phase',
				 'cDNA coding start' = 'cdna.cod.start',
				 'cDNA coding end' = 'cdna.cod.end',
				 'Genomic coding start' = 'gen.cod.start',
				 'Genomic coding end' = 'gen.cod.end',
				 'Ensembl Exon ID' = 'ens.ex.id',
				 'Strand'			= 'strand',
				 'Ensembl Gene ID'	= 'ens.gene.id',
				 'Ensembl Transcript ID'	= 'ens.trans.id',
				 'Associated Gene Name'		= 'gene.name',
				 'Gene Biotype'				= 'gene.biotype',
				 'Transcript Biotype'		= 'trans.biotype')
		
		cat('Reading in the table...\n')
		if(file.exists(file)){
			d = read.table(file, header=F,sep='\t')
		}else{
			stop('the designated file does not exist')
		}
		if(any(!d[1,] %in% names(header)))
			stop('some of the header names are not present in the lib')
		
		cat('Formatin...\n')
		colnames(d) = header[unlist(d[1,])]
		d = d[-1,]
		if(any(d$chr == 'MT'))
			d$chr[d$chr == 'MT'] = 'M'
		if(add.chr == TRUE){
			ind = str_detect(d$chr, '^\\d+') | d$chr %in% c('X','Y','M')
			d$chr = paste('chr',d$chr,sep='')
		}
		if(sum(str_detect(colnames(d), 'strand')) == 1)
			d$strand = ifelse(d$strand == 1, '+', '-')
		if(sum(str_detect(colnames(d), 'strand')) > 1)
			stop('there is more than 1 strand column')
		
		### sets the start/end colnames if there are no appropriate columns
		### gene -> transcript -> exon priorities
		if(all(colnames(d) != 'start' & colnames(d) != 'end')){
			
			w = c(which(colnames(d) == 'tr.start'),
				  which(colnames(d) == 'tr.end'))
			if(length(w) == 0){
				w = c(which(colnames(d) == 'ex.start'),
					  which(colnames(d) == 'ex.end'))
			}
			if(length(w) == 0)
				stop('there are no appropriate start and end columns')
			
			colnames(d)[w] = c('start','end')
		}
		
		pos.ind = na.omit(match(c('chr','start','end','strand'),colnames(d)))
		id.ind = na.omit(match(c('ens.gene.id','ens.trans.id','ens.ex.id'),colnames(d)))
		d.new = data.frame(d[,pos.ind], d[,id.ind], d[,-c(pos.ind,id.ind)])
		
		cat('Savin...\n')
		if(is.null(outpath) & is.null(outname))
			cat('Will overwrite the original file...\n')
		if(is.null(outpath))
			outpath=dirname(file)
		if(is.null(outname))
			outname=basename(file)
		
		
		outfile=file.path(outpath, outname)
		write.table(d.new, outfile, row.names=F, col.names=T, quote=F, sep='\t')
		cat('\n')
	}
	
	
	
	
#/{2} CODE
frenkiboy/MyLib documentation built on Oct. 24, 2020, 11:01 a.m.