R/standalonefunctions.R

#standAloneBuildingLocalAnnotation a stand alone function to create gene-level annotation for Affymetrix exon-arrays
#standAloneAddingAnnotation a stand alone function to add gene-level annotaiton to a data frame, having a column made with gene-level ids
 
#standAloneBuildingLocalAnnotation(libDirLocation = getwd(), netaffxUser = "myemail@somewhere.org", netaffxUserPw = "yourpassword", whichAnnotation = "HuEx")
#building gene and exon annotation for oneChannelGUI 
#ncScaffold build scaffold for ncRNA using bioMart this function create the scaffold used to load ncRNA-seq data if data are mapped only on ncRNA subset
#makeGeneScaffold creating Grange object for UCSC browser needed to map reads on UCSC genes 
#makeGCcontent creating a list of GC frequences associated to the genes defined by makeGeneScaffold
#wrapScaffold  wraping the scaffold and gc frequencies in an Rda object named with the genome reference
################################################################################
"standAloneBuildingLocalAnnotation" <- function(libDirLocation = getwd(), netaffxUser = "myemail@somewhere.org", netaffxUserPw = "yourpassword", whichAnnotation = c("HuEx", "MoEx", "RaEx")){
#    require("AffyCompatible") || stop("\nMissing AffyCompatible library\n")
    #where to save the objects containing the annotation
    libDirLocation <- libDirLocation
    #netaffx user email
    username.tmp <- netaffxUser
    #netaffx user password
    userpw.tmp <- netaffxUserPw
         rsrc <- NetAffxResource(user = username.tmp, password = userpw.tmp, affxLicence = "BIOCON0808")
         rsrc
         ReturnNetaffx <- whichAnnotation
         if(ReturnNetaffx != ""){
               annos <- rsrc[[grep(ReturnNetaffx, names(rsrc))]]
               #gene level
               anno <- affxAnnotation(annos)[[grep("Transcript Cluster Annotations, CSV", affxDescription(rsrc[[grep(ReturnNetaffx, names(rsrc))]]))]]   #"transcript clusters Annotation, CSV Format"
               df <- readAnnotation(rsrc, annotation = anno, content=FALSE)
               df1 <- as.character(unlist(strsplit(df, ".zip")))
               df1 <- as.character(unlist(strsplit(df1,'/')))
               df1 <- df1[length(df1)]
               conn <- unz(df, df1)
               tmp <- readLines(conn, n=500)
               myskip <- grep("transcript_cluster_id", tmp)
               df.ann <- read.csv(conn, skip= (myskip - 1), header=T, as.is=T)
               df.ann <- df.ann[,which(names(df.ann)%in%c("probeset_id", "gene_assignment"))]
               #IMPORTANT to build the GENE level oneChannelGUI annotation file I use only the first assignment mapped on the gen_assignment field in the annotation files
               geneAnnExtraction <- function(x){
                                                 mx <- strsplit(x, '//')
                                                 mx <- as.vector(unlist(mx))
                                                 if(length(mx) >= 4){
                                                       mx <- mx[1:4]
                                                       mxtmp <- gsub(" ", "", mx[c(1:2,4)])
                                                       mx <- c(mxtmp[1:2], mx[3], mxtmp[3])
                                                       return(mx)
                                                 } else{
                                                     mx <- rep(NA, 4)
                                                     return(mx)
                                                 }
                                    }
               myann <- sapply(df.ann[,2], geneAnnExtraction)
               myann <- t(myann)
               rownames(myann) <- df.ann[,1]
               df.ann <- cbind(df.ann[,1], myann)
               df.ann <- as.data.frame(df.ann)
               names(df.ann) <- c("PROBESETID","ACC","SYMBOL","DESCRIPTION","CYTOBAND")
               }
               ####
               if(ReturnNetaffx == "HuEx"){
                       huex.annotation <- df.ann
                       save(huex.annotation, file="huex.annotation.rda")
                       cat("\nHuEx gene level annotation is saved in: ", getwd()," as huex.annotation.rda\n")
                       return(paste(getwd(),"/huex.annotation.rda", sep=""))
               }
               if(ReturnNetaffx == "MoEx"){
                       moex.annotation <- df.ann
                       save(moex.annotation, file="moex.annotation.rda")
                       cat("\nMoEx gene level annotation is saved in: ", getwd()," as moex.annotation.rda\n")
                       return(paste(getwd(),"/moex.annotation.rda", sep=""))
               }
               if(ReturnNetaffx == "RaEx"){
                       raex.annotation <- df.ann
                       save(raex.annotation, file="raex.annotation.rda")
                       cat("\nRaEx gene level annotation is saved in: ", getwd()," as raex.annotation.rda\n")
                       return(paste(getwd(),"/raex.annotation.rda", sep=""))
               }
         
}
#############################################################################################################
#data(file="huex.annotation",package="oneChannelGUI")
#mydf <- read.table("tmpTopTable.txt", sep="\t", header=T)
#annotated.df <- standAloneAddingAnnotation(huex.annotation, mydf, ids.column = 1)
#shrimpReformat reformat the output for shrimp in a way to be used by oneChannelGUI
"standAloneAddingAnnotation" <- function(annotationdf, df.tobe.annotated, ids.column){
      df.tobe.annotated <- df.tobe.annotated[order(as.character(df.tobe.annotated[,1])),]
      myids <- as.character(df.tobe.annotated[,ids.column])
      myann <- annotationdf[which(as.character(annotationdf$PROBESETID)%in%myids),]
      myann <- myann[order(as.character(myann$PROBESETID)),]
      if(identical(as.character(myann$PROBESETID), myids)){
         dfann <- cbind(myann, df.tobe.annotated[,setdiff(seq(1, dim(df.tobe.annotated)[2]), ids.column)])
         return(dfann)
      } else{
        cat("\nIDs from dataframe and those, selected from annotation table, seems to be not identical.\nAnnotation cannot be attached to the data frame!\n")
        return()
      }
}
################################################################################
"ncScaffold" <- function(genome, fasta =TRUE){
       #    require(biomaRt) || stop("library biomaRt could not be found !")
         #  require(Biostrings) || stop("\nBiostrings package is missing\n")
           ensembl <- useMart('ensembl')
           if(genome=="hg19"){
                 ensembl <- useDataset('hsapiens_gene_ensembl', mart=ensembl)
                 chrs <- seq(1, 25)
                 chrs.names <- c(chrs[1:22], "X", "Y", "MT")
                 emblInfo <- getBM(attributes = c('ensembl_gene_id','ensembl_transcript_id','transcript_exon_intron', 'chromosome_name', 'start_position', 'end_position', 'strand'), filters = 'biotype', values = c("miRNA","Mt_rRNA","Mt_tRNA","rRNA","snoRNA","snRNA"), mart = ensembl) 
                 #only the set of nc associated to canonical chr are selected
                 p.set <- emblInfo[intersect(which(emblInfo$chromosome_name %in% chrs.names),which(emblInfo$strand == "1")),]
                 n.set <- emblInfo[intersect(which(emblInfo$chromosome_name %in% chrs.names),which(emblInfo$strand == "-1")),]

                 peaksNames.p <- paste("chr", p.set$chromosome_name, ".", "plus", ".", p.set$start_position, "-", p.set$end_position, "." ,p.set$ensembl_transcript_id, sep="")
                 peaksP <- IRanges(start = p.set$start_position, end = p.set$end_position, names = peaksNames.p)
                 
                 peaksNames.n <- paste("chr", n.set$chromosome_name, ".", "minus", ".", n.set$start_position, "-", n.set$end_position, "." ,n.set$ensembl_transcript_id, sep="")
                 peaksN <- IRanges(start = n.set$start_position, end = n.set$end_position, names = peaksNames.n)
                 ncHs.data <- list("peaksP" = peaksP, "peaksN" = peaksN)
                 save(ncHs.data, file="ncHs.data.rda")
                 #I need also to retrieve the fasta seq
                 if(fasta){
                   p.seq <- apply(p.set, 1, function(x) x[1])
                   p.seq <- cbind(peaksNames.p, p.seq)
                   p.fa <- apply(p.seq, 1,function(x) {
                                           n <- as.character(x[1])
                                           s <- as.character(x[2])
                                           tmp.list <- list("desc"= n, "seq"=s)
                   })
                   n.seq <- apply(n.set, 1, function(x) x[1])
                   n.seq <- cbind(peaksNames.n, n.seq)
                   n.fa <- apply(n.seq, 1,function(x) {
                                           n <- as.character(x[1])
                                           s <- as.character(x[2])
                                           tmp.list <- list("desc"= n, "seq"=s)
                   })
                   writeFASTA(p.fa, file="ncHs.fa", append=F, width=80)
                   writeFASTA(n.fa, file="ncHs.fa", append=T, width=80)
                 }
                 hsfa <- readFASTA("ncHs.fa")
                 save(hsfa, file="ncHs.rda", ascii=T)
          } else if(genome=="mm9"){
                 ensembl <- useDataset('mmusculus_gene_ensembl', mart=ensembl)
                 chrs <- seq(1, 22)
                 chrs.names <- c(chrs[1:19], "X", "Y", "MT")
                 emblInfo <- getBM(attributes = c('ensembl_gene_id','ensembl_transcript_id','transcript_exon_intron', 'chromosome_name', 'start_position', 'end_position', 'strand'), filters = 'biotype', values = c("miRNA","Mt_rRNA","Mt_tRNA","rRNA","snoRNA","snRNA"), mart = ensembl) 
                 #only the set of nc associated to canonical chr are selected
                 p.set <- emblInfo[intersect(which(emblInfo$chromosome_name %in% chrs.names),which(emblInfo$strand == "1")),]
                 n.set <- emblInfo[intersect(which(emblInfo$chromosome_name %in% chrs.names),which(emblInfo$strand == "-1")),]

                 peaksNames.p <- paste("chr", p.set$chromosome_name, ".", "plus", ".", p.set$start_position, "-", p.set$end_position, "." ,p.set$ensembl_transcript_id, sep="")
                 peaksP <- IRanges(start = p.set$start_position, end = p.set$end_position, names = peaksNames.p)
                 
                 peaksNames.n <- paste("chr", n.set$chromosome_name, ".", "minus", ".", n.set$start_position, "-", n.set$end_position, "." ,n.set$ensembl_transcript_id, sep="")
                 peaksN <- IRanges(start = n.set$start_position, end = n.set$end_position, names = peaksNames.n)
                 ncMm.data <- list("peaksP" = peaksP, "peaksN" = peaksN)
                 save(ncMm.data, file="ncMm.data.rda")
                 #I need also to retrieve the fasta seq
                 if(fasta){
                   p.seq <- apply(p.set, 1, function(x) x[1])
                   p.seq <- cbind(peaksNames.p, p.seq)
                   p.fa <- apply(p.seq, 1,function(x) {
                                           n <- as.character(x[1])
                                           s <- as.character(x[2])
                                           tmp.list <- list("desc"= n, "seq"=s)
                   })
                   n.seq <- apply(n.set, 1, function(x) x[1])
                   n.seq <- cbind(peaksNames.n, n.seq)
                   n.fa <- apply(n.seq, 1,function(x) {
                                           n <- as.character(x[1])
                                           s <- as.character(x[2])
                                           tmp.list <- list("desc"= n, "seq"=s)
                   })
                   writeFASTA(p.fa, file="ncMm.fa", append=F, width=80)
                   writeFASTA(n.fa, file="ncMm.fa", append=T, width=80)
                 }
                 mmfa <- readFASTA("ncMm.fa")
                 save(mmfa, file="ncMm.rda", ascii=T)

          } else if(genome=="rn4"){
                 ensembl <- useDataset('rnorvegicus_gene_ensembl', mart=ensembl)
                 chrs <- seq(1, 21)
                 chrs.names <- c(chrs[1:19], "X", "MT")
                 emblInfo <- getBM(attributes = c('ensembl_gene_id','ensembl_transcript_id','transcript_exon_intron', 'chromosome_name', 'start_position', 'end_position', 'strand'), filters = 'biotype', values = c("miRNA","Mt_rRNA","Mt_tRNA","rRNA","snoRNA","snRNA"), mart = ensembl) 
                 #only the set of nc associated to canonical chr are selected
                 p.set <- emblInfo[intersect(which(emblInfo$chromosome_name %in% chrs.names),which(emblInfo$strand == "1")),]
                 n.set <- emblInfo[intersect(which(emblInfo$chromosome_name %in% chrs.names),which(emblInfo$strand == "-1")),]

                 peaksNames.p <- paste("chr", p.set$chromosome_name, ".", "plus", ".", p.set$start_position, "-", p.set$end_position, "." ,p.set$ensembl_transcript_id, sep="")
                 peaksP <- IRanges(start = p.set$start_position, end = p.set$end_position, names = peaksNames.p)
                 
                 peaksNames.n <- paste("chr", n.set$chromosome_name, ".", "minus", ".", n.set$start_position, "-", n.set$end_position, "." ,n.set$ensembl_transcript_id, sep="")
                 peaksN <- IRanges(start = n.set$start_position, end = n.set$end_position, names = peaksNames.n)
                 ncRn.data <- list("peaksP" = peaksP, "peaksN" = peaksN)
                 save(ncRn.data, file="ncRn.data.rda")
                 #I need also to retrieve the fasta seq
                 if(fasta){
                   p.seq <- apply(p.set, 1, function(x) x[1])
                   p.seq <- cbind(peaksNames.p, p.seq)
                   p.fa <- apply(p.seq, 1,function(x) {
                                           n <- as.character(x[1])
                                           s <- as.character(x[2])
                                           tmp.list <- list("desc"= n, "seq"=s)
                   })
                   n.seq <- apply(n.set, 1, function(x) x[1])
                   n.seq <- cbind(peaksNames.n, n.seq)
                   n.fa <- apply(n.seq, 1,function(x) {
                                           n <- as.character(x[1])
                                           s <- as.character(x[2])
                                           tmp.list <- list("desc"= n, "seq"=s)
                   })
                   writeFASTA(p.fa, file="ncRn.fa", append=F, width=80)
                   writeFASTA(n.fa, file="ncRn.fa", append=T, width=80)
                 }
                 rnfa <- readFASTA("ncRn.fa")
                 save(rnfa, file="ncRn.rda", ascii=T)

         } else{
             cat("\nYou have selected a genome which is not implemented. \nImplemented genomes are:\n\thg19\n\tmm9\n\trn4\n")
         }
         return()
}
##################################################################################
"exonScaffold" <- function(genome){
       #    require(biomaRt) || stop("library biomaRt could not be found !")
      #     require(Biostrings) || stop("\nBiostrings package is missing\n")
           ensembl <- useMart('ensembl')
           if(genome=="hg19"){
                 ensembl <- useDataset('hsapiens_gene_ensembl', mart=ensembl)
                 chrs <- seq(1, 25)
                 chrs.names <- c(chrs[1:22], "X", "Y", "MT")
                    emblInfo <- getBM(attributes = c('ensembl_gene_id','ensembl_exon_id','chromosome_name', 'exon_chrom_start', 'exon_chrom_end', 'strand', 'rank'), mart = ensembl)     
                    #plus and minus sted are organized separately
                    p.set <- emblInfo[intersect(which(emblInfo$chromosome_name %in% chrs.names),which(emblInfo$strand == "1")),]
                    n.set <- emblInfo[intersect(which(emblInfo$chromosome_name %in% chrs.names),which(emblInfo$strand == "-1")),]

                    peaksNames.p <- paste("chr", p.set$chromosome_name, ".", "plus", ".", p.set$exon_chrom_start, "-", p.set$exon_chrom_end, ".", p.set$ensembl_gene_id, "." ,p.set$ensembl_exon_id, ".", p.set$rank, sep="")
                    peaksP <- IRanges(start = p.set$exon_chrom_start, end = p.set$exon_chrom_end, names = peaksNames.p)
                 
                    peaksNames.n <- paste("chr", n.set$chromosome_name, ".", "minus", ".", n.set$exon_chrom_start, "-", n.set$exon_chrom_end, "." ,n.set$ensembl_gene_id, "." ,n.set$ensembl_exon_id, ".", n.set$rank, sep="")
                    peaksN <- IRanges(start = n.set$exon_chrom_start, end = n.set$exon_chrom_end, names = peaksNames.n)
                    exonHs.data <- list("peaksP" = peaksP, "peaksN" = peaksN)
                    save(exonHs.data, file="exonHs.data.rda", ascii=T)
          } else if(genome=="mm9"){
                 ensembl <- useDataset('mmusculus_gene_ensembl', mart=ensembl)
                 chrs <- seq(1, 22)
                 chrs.names <- c(chrs[1:19], "X", "Y", "MT")
                    emblInfo <- getBM(attributes = c('ensembl_gene_id','ensembl_exon_id','chromosome_name', 'exon_chrom_start', 'exon_chrom_end', 'strand', 'rank'), mart = ensembl)     
                    #only the set of nc associated to canonical chr are selected
                    p.set <- emblInfo[intersect(which(emblInfo$chromosome_name %in% chrs.names),which(emblInfo$strand == "1")),]
                    n.set <- emblInfo[intersect(which(emblInfo$chromosome_name %in% chrs.names),which(emblInfo$strand == "-1")),]

                    peaksNames.p <- paste("chr", p.set$chromosome_name, ".", "plus", ".", p.set$exon_chrom_start, "-", p.set$exon_chrom_end, "." ,p.set$ensembl_transcript_id, sep="")
                    peaksP <- IRanges(start = p.set$exon_chrom_start, end = p.set$exon_chrom_end, names = peaksNames.p)
                 
                    peaksNames.n <- paste("chr", n.set$chromosome_name, ".", "minus", ".", n.set$exon_chrom_start, "-", n.set$exon_chrom_end, "." ,n.set$ensembl_transcript_id, sep="")
                    peaksN <- IRanges(start = n.set$exon_chrom_start, end = n.set$exon_chrom_end, names = peaksNames.n)
                    exonMm.data <- list("peaksP" = peaksP, "peaksN" = peaksN)
                    save(exonMm.data, file=paste("chr",chrs.names,"exonMm.data.rda", sep="."))
          } else if(genome=="rn4"){
                 ensembl <- useDataset('rnorvegicus_gene_ensembl', mart=ensembl)
                 chrs <- seq(1, 21)
                 chrs.names <- c(chrs[1:19], "X", "MT")
                    emblInfo <- getBM(attributes = c('ensembl_gene_id','ensembl_exon_id','chromosome_name', 'exon_chrom_start', 'exon_chrom_end', 'strand', 'rank'), mart = ensembl)     
                    #only the set of nc associated to canonical chr are selected
                    p.set <- emblInfo[intersect(which(emblInfo$chromosome_name %in% chrs.names),which(emblInfo$strand == "1")),]
                    n.set <- emblInfo[intersect(which(emblInfo$chromosome_name %in% chrs.names),which(emblInfo$strand == "-1")),]

                    peaksNames.p <- paste("chr", p.set$chromosome_name, ".", "plus", ".", p.set$exon_chrom_start, "-", p.set$exon_chrom_end, "." ,p.set$ensembl_transcript_id, sep="")
                    peaksP <- IRanges(start = p.set$exon_chrom_start, end = p.set$exon_chrom_end, names = peaksNames.p)
                 
                    peaksNames.n <- paste("chr", n.set$chromosome_name, ".", "minus", ".", n.set$exon_chrom_start, "-", n.set$exon_chrom_end, "." ,n.set$ensembl_transcript_id, sep="")
                    peaksN <- IRanges(start = n.set$exon_chrom_start, end = n.set$exon_chrom_end, names = peaksNames.n)
                    exonRn.data <- list("peaksP" = peaksP, "peaksN" = peaksN)
                    save(exonRn.data, file=paste("chr",chrs.names,"exonMm.data.rda", sep="."))
         } else{
             cat("\nYou have selected a genome which is not implemented. \nImplemented genomes are:\n\thg19\n\tmm9\n\trn4\n")
         }
         return()
}

################################################################################
"makeGeneScaffold" <- function(whichRef = c("hg19", "mm9", "rn4")){
     Try(cat("\nStart creating the gene-level scaffold for reads mapping....\nBe patient!\n"))
 #    Try(require(rtracklayer) || stop("\nMissing library rtracklayer\n"))
     Try(data("chrLength", package="oneChannelGUI"))
     if(whichRef=="hg19"){
        Try(chr.length <- chrLength$hg19[which(names(chrLength$hg19)!="MT")])
        Try(session <- browserSession())
        Try(genome(session) <- "hg19")
    #   Try(trackNames(session))   #show the available tables
        # UCSC genes knownGene
	    Try(query <- ucscTableQuery(session, "knownGene"))
	    Try(tableName(query)  <- "knownToLocusLink")
	    Try(myEGtable <- getTable(query))
	    Try(chr.eg <- list())
        Try(chr.gr <- list())
        for( i in 1:length(chr.length)){
               Try(mychr <- paste("chr", names(chr.length)[i], sep=""))
               Try(query <- ucscTableQuery(session, "knownGene", GRangesForUCSCGenome("hg19", mychr, IRanges(1, as.numeric(chr.length[i])))))
               Try(tableName(query)  <- "knownGene")   #indicates which table I am interested
               Try(mytable <- getTable(query))
               Try(mytable <- mytable[which(mytable$proteinID!=""),]) #keeping only genes associated to proteins
               Try(mytable <- mytable[setdiff(seq(1,dim(mytable)[1]), which(duplicated(mytable$proteinID))),]) #since analysis at gene level I keep only one of the duplicated transcript isoforms
               #associating eg to uscs transcripts names
               Try(myeg <- myEGtable[which(myEGtable$name%in%mytable$name),])
               Try(mytable <- mytable[which(mytable$name%in%myeg$name),])
               Try(mytable <- mytable[setdiff(seq(1,dim(mytable)[1]), which(duplicated(mytable$name))),])
               Try(mytable <- mytable[order(as.character(mytable$name)),])
               Try(myeg <- myeg[order(as.character(myeg$name)),])
               #identical(as.character(mytable$name), as.character(myeg$name))
               #creating a gene-level grange object
               Try(mygr <- GRanges(seqnames = Rle(rep(mychr, dim(mytable)[1])), ranges = IRanges(start=mytable$txStart, end=mytable$txEnd), strand = Rle(as.character(mytable$strand)), names=as.character(mytable$proteinID)))
               Try(myeg <- cbind(myeg, elementMetadata(mygr)$names))
               Try(names(myeg)<-c("names","EG","proteinID"))
               Try(chr.gr[[mychr]] <- mygr)
               Try(chr.eg[[mychr]] <- myeg)
       }
       
      }else if(whichRef == "mm9"){
	        Try(chr.length <- chrLength$mm9[which(names(chrLength$mm9)!="MT")])
	        Try(session <- browserSession())
	        Try(genome(session) <- "mm9")
	    #   Try(trackNames(session))   #show the available tables
	        # UCSC genes knownGene
		    Try(query <- ucscTableQuery(session, "knownGene"))
		    Try(tableName(query)  <- "knownToLocusLink")
		    Try(myEGtable <- getTable(query))
		    Try(chr.eg <- list())
	        Try(chr.gr <- list())
	        for( i in 1:length(chr.length)){
	               Try(mychr <- paste("chr", names(chr.length)[i], sep=""))
	               Try(query <- ucscTableQuery(session, "knownGene", GRangesForUCSCGenome("mm9", mychr, IRanges(1, as.numeric(chr.length[i])))))
	               Try(tableName(query)  <- "knownGene")   #indicates which table I am interested
	               Try(mytable <- getTable(query))
	               Try(mytable <- mytable[which(mytable$proteinID!=""),]) #keeping only genes associated to proteins
	               Try(mytable <- mytable[setdiff(seq(1,dim(mytable)[1]), which(duplicated(mytable$proteinID))),]) #since analysis at gene level I keep only one of the duplicated transcript isoforms
	               #associating eg to uscs transcripts names
	               Try(myeg <- myEGtable[which(myEGtable$name%in%mytable$name),])
	               Try(mytable <- mytable[which(mytable$name%in%myeg$name),])
	               Try(mytable <- mytable[setdiff(seq(1,dim(mytable)[1]), which(duplicated(mytable$name))),])
	               Try(mytable <- mytable[order(as.character(mytable$name)),])
	               Try(myeg <- myeg[order(as.character(myeg$name)),])
	               #identical(as.character(mytable$name), as.character(myeg$name))
	               #creating a gene-level grange object
	               Try(mygr <- GRanges(seqnames = Rle(rep(mychr, dim(mytable)[1])), ranges = IRanges(start=mytable$txStart, end=mytable$txEnd), strand = Rle(as.character(mytable$strand)), names=as.character(mytable$proteinID)))
	               Try(myeg <- cbind(myeg, elementMetadata(mygr)$names))
	               Try(names(myeg)<-c("names","EG","proteinID"))
	               Try(chr.gr[[mychr]] <- mygr)
	               Try(chr.eg[[mychr]] <- myeg)
       }
      }else if(whichRef=="rn4"){
	        Try(chr.length <- chrLength$rn4[which(names(chrLength$rn4)!="MT")])
	        Try(session <- browserSession())
	        Try(genome(session) <- "rn4")
	    #   Try(trackNames(session))   #show the available tables
	        # UCSC genes knownGene
		    Try(query <- ucscTableQuery(session, "knownGene"))
		    Try(tableName(query)  <- "knownToLocusLink")
		    Try(myEGtable <- getTable(query))
		    Try(chr.eg <- list())
	        Try(chr.gr <- list())
	        for( i in 1:length(chr.length)){
	               Try(mychr <- paste("chr", names(chr.length)[i], sep=""))
	               Try(query <- ucscTableQuery(session, "knownGene", GRangesForUCSCGenome("rn4", mychr, IRanges(1, as.numeric(chr.length[i])))))
	               Try(tableName(query)  <- "knownGene")   #indicates which table I am interested
	               Try(mytable <- getTable(query))
	               Try(mytable <- mytable[which(mytable$proteinID!=""),]) #keeping only genes associated to proteins
	               Try(mytable <- mytable[setdiff(seq(1,dim(mytable)[1]), which(duplicated(mytable$proteinID))),]) #since analysis at gene level I keep only one of the duplicated transcript isoforms
	               #associating eg to uscs transcripts names
	               Try(myeg <- myEGtable[which(myEGtable$name%in%mytable$name),])
	               Try(mytable <- mytable[which(mytable$name%in%myeg$name),])
	               Try(mytable <- mytable[setdiff(seq(1,dim(mytable)[1]), which(duplicated(mytable$name))),])
	               Try(mytable <- mytable[order(as.character(mytable$name)),])
	               Try(myeg <- myeg[order(as.character(myeg$name)),])
	               #identical(as.character(mytable$name), as.character(myeg$name))
	               #creating a gene-level grange object
	               Try(mygr <- GRanges(seqnames = Rle(rep(mychr, dim(mytable)[1])), ranges = IRanges(start=mytable$txStart, end=mytable$txEnd), strand = Rle(as.character(mytable$strand)), names=as.character(mytable$proteinID)))
	               Try(myeg <- cbind(myeg, elementMetadata(mygr)$names))
	               Try(names(myeg)<-c("names","EG","proteinID"))
	               Try(chr.gr[[mychr]] <- mygr)
	               Try(chr.eg[[mychr]] <- myeg)
	      }
     }
     Try(cat("\nEnd creating the gene-level scaffold for reads mapping\n"))
     Try(output <- list("chr.gr"=chr.gr, "chr.eg"=chr.eg))
     Try(return(output))
}

################################################################################
#scaffold is the GRange obj containing the position on chrs for each of the genes
#whichRef refers to the genome of interest
"makeGCcontent" <- function(scaffold, whichRef = c("hg19", "mm9", "rn4")){
	Try(chr.gr <- scaffold)
    Try(cat("\nStart creating the scaffold GC frequency \n"))
     Try(cat("\nEnd creating the scaffold GC frequency \n"))

    if(whichRef == "hg19"){
          Try(library("BSgenome.Hsapiens.UCSC.hg19"))
    } else if(whichRef == "mm9"){
          Try(library("BSgenome.Mmusculus.UCSC.mm9"))
    } else if(whichRef == "rn4"){
          Try(library("BSgenome.Rnorvegicus.UCSC.rn4"))
    }
    Try(gc.freq <- list())
    if(whichRef == "hg19"){
         #scanning over chrs
         for(i in names(chr.gr)){
         #extract for a specific chr only the ranges of the genes from the GRange obj
             Try(IRanges.tmp <- ranges(chr.gr[[i]]))
             Try(tmpseq <- getSeq(Hsapiens, i, start=start(IRanges.tmp), end=end((IRanges.tmp))))
             Try(tmpfreq <- alphabetFrequency(tmpseq, as.prob=T,  baseOnly=T))
             #sum GC freq
             Try(gcfreq <- apply(tmpfreq[,2:3], 1, sum))
             Try(names(gcfreq) <- elementMetadata(chr.gr[[i]])$names)
             Try(gc.freq[[i]] <- gcfreq)
         }
    } else if(whichRef == "mm9"){
         #scanning over chrs
         for(i in names(chr.gr)){
         #extract for a specific chr only the ranges of the genes from the GRange obj
             Try(IRanges.tmp <- ranges(chr.gr[[i]]))
             Try(tmpseq <- getSeq(Mmusculus, i, start=start(IRanges.tmp), end=end((IRanges.tmp))))
             Try(tmpfreq <- alphabetFrequency(tmpseq, as.prob=T,  baseOnly=T))
             #sum GC freq
             Try(gcfreq <- apply(tmpfreq[,2:3], 1, sum))
             Try(names(gcfreq) <- elementMetadata(chr.gr[[i]])$names)
             Try(gc.freq[[i]] <- gcfreq)
         }
    } else if(whichRef == "rn4"){
         #scanning over chrs
         for(i in names(chr.gr)){
         #extract for a specific chr only the ranges of the genes from the GRange obj
             Try(IRanges.tmp <- ranges(chr.gr[[i]]))
             Try(tmpseq <- getSeq(Rnorvegicus, i, start=start(IRanges.tmp), end=end((IRanges.tmp))))
             Try(tmpfreq <- alphabetFrequency(tmpseq, as.prob=T,  baseOnly=T))
             #sum GC freq
             Try(gcfreq <- apply(tmpfreq[,2:3], 1, sum))
             Try(names(gcfreq) <- elementMetadata(chr.gr[[i]])$names)
             Try(gc.freq[[i]] <- gcfreq)
         }
    }
    Try(return(gc.freq))
}
################################################################################
"wrapScaffold" <- function(whichRef = c("hg19", "mm9", "rn4")){
	 Try(whichRef <- whichRef)
     Try(tmp <- makeGeneScaffold(as.character(whichRef)))
     Try(chr.gr <- tmp[[1]])
     Try(chr.eg <- tmp[[2]])
     Try(gc.freq <- makeGCcontent(scaffold=chr.gr, as.character(whichRef))) 
     Try(save(chr.gr, chr.eg, gc.freq, file=paste(whichRef,"_gene-level.scaffold.Rda",sep="")))
}


################################################################################

Try the oneChannelGUI package in your browser

Any scripts or data that you put into this service are public.

oneChannelGUI documentation built on Nov. 17, 2017, 11:02 a.m.