R/genericFunctions.R

Defines functions .makeNullProfile .makeUniformModel .sampleUnifEntropyL .sampleUnifEntropyL.old .profileEntropy .profileDensity .shannon .MAD .alnWeights .getLocalGeneMutations .getGeneMutations .scoreMatrix .clustalMatrix .filterMAlign .clustalOAlign .Trident_Score .checkGene_to_geneID showTumorType .PerlModuleChecks .ClustalChecks .findPerl

Documented in showTumorType

## for motifStack package
.globals <- new.env(parent = emptyenv())

#######
#### functions hidden to the user
####################################

.findPerl <- function(perl, verbose = "FALSE"){
  errorMsg <- "perl executable not found. Use perl= argument to specify the correct path." 
  if (missing(perl))
    {
      perl = "perl"
    }
  perl = Sys.which(perl)
  if (perl=="" || perl=="perl")
    stop(errorMsg)
  if (.Platform$OS == "windows") {
    if (length(grep("rtools", tolower(perl))) > 0) {
      perl.ftype <- shell("ftype perl", intern = TRUE)
      if (length(grep("^perl=", perl.ftype)) > 0) {
        perl <- sub('^perl="([^"]*)".*', "\\1", perl.ftype)
      }
    }
      }
  if (verbose) cat("Using perl at", perl, "\n")
  perl
}

.ClustalChecks <- function(ClustalCommand="clustalo") {
    #Check for clustalo in the PATH
    message("Checking if clustalo is in the PATH...")
    ClustalCommand <- Sys.which(ClustalCommand)
    if(ClustalCommand=="") {
        warning("Clustal Omega is not in the PATH:\nYou can either change clustalo command using lmParams function or use the web service. See ?setup")
        return()
    }
    message("Checking clustalo Version...")
    ClustalVersion <- system(paste(ClustalCommand ,  "--version") , intern=TRUE)
    if(!grepl("^1.2" , ClustalVersion))
        warning("Clustal Omega version could be not compatible. LowMACA was tested on 1.2.x")
}
.PerlModuleChecks <- function(stop=FALSE , perl="perl"){
    if(stop)
        myFunc <- stop
    else
        myFunc <- warning
    message("Checking perl installation...")
    perl <- tryCatch(.findPerl(perl=perl) , error=function(e) "no perl")
    if(perl=="no perl"){
        warning("perl executable is not in the PATH. Remember to install perl and its modules XML::Simple and LWP if you want to use web service aligner.")
        return()
    }
    message("Checking perl modules XML::Simple and LWP...")
    checkXML <- system( paste(perl , "-MXML::Simple -e 1") , intern=TRUE)
    failed <- !is.null(attr(checkXML, 'status')) && attr(checkXML, 'status') != 0
    if(failed)
        myFunc(paste( "XML::Simple module for perl is not installed. 
            If you don't want to install a local clustal omega and use the web service, XML::Simple is required", checkXML , sep="\n"))
    perl <- .findPerl()
    checkLWP <- system( paste(perl , "-MLWP -e 1") , intern=TRUE)
    failed <- !is.null(attr(checkLWP, 'status')) && attr(checkLWP, 'status') != 0
    if(failed)
        myFunc(paste( "LWP module for perl is not installed. 
            If you don't want to install a local clustal omega and use the web service, LWP is required", checkLWP , sep="\n"))
}
.myTrimmer <- function (object, ...) {
   s <- sub("^[\t\n\f\r ]*", "", as.character(object))
   s <- sub("[\t\n\f\r ]*$", "", s)
   s
}

showTumorType <- function() {
    # mycgds <- cgdsr::CGDS("http://www.cbioportal.org/")
    # all_cancer_studies <- cgdsr::getCancerStudies(mycgds)[,c(1,2)]
    mycgds <- cBioPortalData::cBioPortal(
      hostname = "www.cbioportal.org",
      protocol = "https",
      api. = "/api/api-docs")
    all_cancer_studies <- cBioPortalData::getStudies(mycgds)
    all_cancer_studies2 <- unique(
        data.frame(
            # Code=sapply(all_cancer_studies$cancer_study_id
            #     , function(x) strsplit(x , "_")[[1]][1])
            Code=all_cancer_studies$cancerTypeId
            , Full_Name=sapply(all_cancer_studies$name
                , function(x) .myTrimmer(strsplit(x , "\\(")[[1]][1]))
                ))
    all_cancer_studies3 <- aggregate(Full_Name~Code, all_cancer_studies2
        , FUN=function(x) {paste(x , collapse="|")})
    out <- as.character(all_cancer_studies3$Code)
    names(out) <- all_cancer_studies3$Full_Name
    return(out)
}

.checkGene_to_geneID <- function(genes, myUni, myAlias) {
# This function checks the gene ids provided by the user
# transforming eventual aliases to official HugoSymbols,
# removing duplicated itmes and returning the corresponding EntrezID
    myAliasUnmapped <- myAlias[ myAlias$MappedByLowMACA=="no" , ]
    myAlias <- myAlias[ myAlias$MappedByLowMACA=="yes" , ]
    # make all symbols uppercase in order to avoid
    # ambiguities
    genes <- toupper(genes)
    # good genes are considered when provided as EntrezID or HugoSymbols
    good_genes <- c(as.character(myUni$Gene_Symbol), 
        as.character(myUni$Entrez))
    if( all(genes %in% good_genes) ) {
        message("All Gene Symbols correct!")
        # collect the annotations of id provided as gene symbols
        isGeneSymbol <- myUni$Gene_Symbol %in% genes
        Official_gs <- myUni[isGeneSymbol, c("Gene_Symbol", "Entrez")]
        # collect the annotations of id provided as EntrezID
        isEntrez <- as.character(myUni$Entrez) %in% genes
        Official_entrez <- myUni[isEntrez, c("Gene_Symbol", "Entrez")]
        # merge the two annotations
        Official <- rbind(Official_gs, Official_entrez)
        Official$Alias <- rep(NA, nrow(Official))
        # check for duplicated itmes, in case there are make a
        # warning with the duplicated items
        if( any(duplicated(Official$Entrez)) ) 
        {
            warning("Either there were duplicated Gene Symbols or Entrez IDs 
                or you put a Gene Symbol along with its Entrez ID:"
              , immediate.=TRUE)
            message(Official[duplicated(Official$Entrez), ])
        }
        # remove duplicated items, drop the factors' levels 
        # and return
        Official <- unique(Official[, c("Gene_Symbol", "Entrez")])
        Official <- droplevels(Official)
        return(Official)
    } else {
        # collect the annotations of id provided as gene symbols
        isGeneSymbol <- myUni$Gene_Symbol %in% genes
        Official_gs <- myUni[isGeneSymbol, c("Gene_Symbol", "Entrez")]
        # collect the annotations of id provided as EntrezID
        isEntrez <- as.character(myUni$Entrez) %in% genes
        Official_entrez <- myUni[isEntrez, c("Gene_Symbol", "Entrez") ]
        # merge the two annotations
        Official <- rbind(Official_gs, Official_entrez)
        Official$Alias <- rep(NA, nrow(Official))
        # find genes who were not provided as either EntrezID or HugoSymbol
        notOfficial <- setdiff(genes, good_genes)
        # try to assign them through alias
        bad_alias <- setdiff(notOfficial, myAlias$Alias)
        if( length(bad_alias)==0 ) 
        {
            notOfficial <- lapply(notOfficial, function(x) {
                gs <- unique(myAlias[myAlias$Alias==x,"Official_Gene_Symbol"])
                isGeneSymbol <- myUni$Gene_Symbol %in% gs
                official <- myUni[isGeneSymbol, c("Gene_Symbol", "Entrez")]
                official$Alias <- rep(x, nrow(official))
                return(official)
                })
            if( all(sapply(notOfficial, nrow)==1)) {
                out <- do.call("rbind", notOfficial)
                message("These Genes were reverted to their official Gene Symbol:")
                message(out)
                out <- rbind(Official, out)
                if( any( duplicated(out$Entrez) ) ) {
                    warning("There were duplicated Gene Symbols or Entrez IDs 
                        or you put a Gene Symbol along with its Entrez ID:"
                      , immediate.=TRUE)
                    message(out[duplicated(out$Entrez), ])
                }
                out <- unique(out[, c("Gene_Symbol", "Entrez")])
                out <- droplevels(out)
                return(out)
                # return(droplevels( unique(out[, c(1, 2)]) ) )
            } else {
                message("There is an ambiguity with some aliases:")
                bad_alias_2 <- sapply(notOfficial, length)!=1
                bad_alias_3 <- do.call("rbind", notOfficial[bad_alias_2])
                message( bad_alias_3 )
                message("Choose a correct Gene Symbol and start over :(")
                return( bad_alias_3 )
            }
        } else {
            wrongGenes <- !( bad_alias %in% c(myAliasUnmapped$Official_Gene_Symbol ,
                                                myAliasUnmapped$Alias) )
            if( any(wrongGenes) ) {
                wrongGenes <- bad_alias[wrongGenes]
                message("There are invalid Gene Symbol or Entrez IDs:")
                message(wrongGenes)
                stop("Check manually and start over :(")                
            }
            unmappedGenes <- ( bad_alias %in% c(myAliasUnmapped$Official_Gene_Symbol ,
                                                myAliasUnmapped$Alias) )
            if( any(unmappedGenes) ) {
                unmappedGenes <- bad_alias[unmappedGenes]
                message("There are valid genes that have not been mapped by LowMACA:")
                message(unmappedGenes)
                stop("We are sorry, remove these genes and start over :(")
            }
        }
    }
}

#.Trident_Score <- function(origMAlign , cons_mat="BLOSUM62", param=c(1 , 0.5 , 2) ) {
.Trident_Score <- function(origMAlign , cons_mat="BLOSUM62", param=c(1 , 1 , 0) ) {
            m <- consensusMatrix(origMAlign)
            freq_mat <- pcm2pfm(m)
            aminos <- rownames(m)[rownames(m)!='-']
            if(cons_mat=="BLOSUM62") {
                data("BLOSUM62", envir = environment())
                BLOSUM62 <- get("BLOSUM62", envir  = environment())
                myBLOSUM <- BLOSUM62[aminos , aminos]
            } else {
                myBLOSUM <- cons_mat[aminos , aminos]
            }
            ### force negative elements on the diagonal to be 0
            if(any(diag(myBLOSUM)<0)) {
                message('There are some negative elements in the diagonal elements of consensus matrix')
                message(diag(myBLOSUM)[diag(myBLOSUM)<0])
                message('Forcing them to be zero')
                diag(myBLOSUM)[diag(myBLOSUM)<0] <- 0
            }
            myBLOSUM_valdar <- sapply(aminos , function(col) {
                sapply(aminos , function(row) {
                        myBLOSUM[col , row]/sqrt(myBLOSUM[col , col]*myBLOSUM[row , row])
                    })
                })
            #first member
            lambda_t <- 1/log2(min(21 , nrow(origMAlign)))
            log2freq_mat <- ifelse( is.infinite(log2(freq_mat)) , 0 , log2(freq_mat))
            t <- lambda_t*(-colSums(freq_mat*log2freq_mat))
            #second member
            MyNorm <- function(x) sqrt(sum(x^2))
            lambda_r <- 1/( sqrt(20)*(max(myBLOSUM_valdar) - min(myBLOSUM_valdar)) )
            r <- sapply(1:ncol(freq_mat) , function(i) {
                pos <- freq_mat[ aminos , i]
                amin <- names(pos[pos!=0])
                if( length(amin)==0 ) {
                    return(1)
                } else if( length(amin)==1 ) {
                    return(0)
                } else {
                    myBLOSUM_cut <- myBLOSUM_valdar[ , amin]
                    x_mean <- (1/length(amin))*rowSums(myBLOSUM_cut)
                    distance <- sum(sapply(1:ncol(myBLOSUM_cut) , function(x) {
                                                eucl <- MyNorm(x_mean-myBLOSUM_cut[ , x])
                                    }))
                    out <- distance/length(amin)
                    out <- out*lambda_r
                    return(out)
                }
            })
            #third member
            if( "-" %in% rownames(freq_mat) )
                g <- freq_mat["-" , ]
            else
                g <- 0
            #Finally
            result <- ((1-t)^param[1])*((1-r)^param[2])*((1-g)^param[3])
            return(result)
}

.clustalOAlign <- function(genesData, clustal_cmd, filename , mail , perlCommand, use_hmm, datum){
    # # get the protein sequences corresponding to the selected genes
    # # from the uniprot_file
    # arrange the protein sequences to make a fasta format file
    fasta <- c()
    seq_names <- rownames(genesData)
    if(length(seq_names)>2000 && !is.null(mail))
        stop("You cannot evaluate more than 2000 sequences in web mode. Install a local clustalo")
    for (i in 1:nrow(genesData)) {
        seq_name <- paste( ">", seq_names[i], sep='')
        seq <- as.character(genesData[i, 'AMINO_SEQ'])
        fasta <- c(fasta, seq_name, seq)
    }
    # download hmm file
    if( use_hmm ) {
        hmmFile <- tempfile()
        # e.g. http://pfam.xfam.org/family/PF00071/hmm
        pfam <- as.character(genesData$Pfam_ID[1])
        # WebQuery <- paste0('http://pfam.xfam.org/family/', pfam, '/hmm')
        WebQuery <- paste0('http://pfam-legacy.xfam.org/family/', pfam, '/hmm')
        # https://www.ebi.ac.uk/interpro/entry/pfam/PF00071
        if(Sys.info()['sysname']=="Windows")
            download.file(WebQuery, destfile=hmmFile , mode="wb")
        else
            download.file(WebQuery, destfile=hmmFile)
    }

    # write the fasta file on a temporary file
    fastafile <- tempfile()
    write.table(fasta, fastafile, 
        quote=FALSE, row.names=FALSE, col.names=FALSE)
    clustalout_clu <- tempfile()
    # run clustaomega output in Clustal format
    if(nrow(genesData)>2) {
        if(is.null(mail)) {
            #clustalout_clu <- tempfile()
            ClustalOmega <- Sys.which(clustal_cmd)
            if(ClustalOmega=="")
                stop("Clustal Omega command not found. clustalo is not in your PATH or it was not installed")
                #clustal omega pairwise distance matrix
            dist_mat <- tempfile()
            dist_cmd <- paste("--distmat-out" , dist_mat , sep="=")
            if( use_hmm ) {
                exec <- paste(ClustalOmega, paste0('--hmm-in=', hmmFile), '--outfmt=clu' , '--percent-id' 
                    , dist_cmd , "--full --force" , '-i', fastafile, '>', clustalout_clu)
            }
            else {
                exec <- paste(ClustalOmega,'--outfmt=clu' , '--percent-id' 
                    , dist_cmd , "--full --force" , '-i', fastafile, '>', clustalout_clu)                
            }
            #Windows doesn't accept the redirection >, so we must use shell
            if( Sys.info()['sysname']=="Windows" ) {
                exec <- gsub("\\\\" , "/" , exec)
                cmdCheck <- shell(exec , intern=TRUE)
            } else {
                cmdCheck <- system(exec , intern=TRUE)
            }
            if( use_hmm ) unlink(hmmFile)
            #Check if system call to clustalo had 0 exit status
            failed <- !is.null(attr(cmdCheck, 'status')) && attr(cmdCheck, 'status') != 0
            if(failed)
                stop(paste("Alignment with ClustalOmega had non 0 exit status:",cmdCheck , sep="\n"))
            if( !(use_hmm | datum) ) 
            {
                score <- .scoreMatrix(dist_mat , mail=mail)
            } else {
                score <- NA                
            }
            aln <- .clustalMatrix(clustalout_clu)
            see_aln <- readAAMultipleAlignment(filepath = clustalout_clu
                , format="clustal")
            if( is.null(filename) ) {
                unlink(clustalout_clu)
            } else {
                file.rename(clustalout_clu, filename)
            }
            unlink(dist_mat)
            unlink(fastafile)
        } else {
            if( use_hmm ) stop("You cannot evaluate HMM in web mode. Install a local clustalo")
            perl <- .findPerl(perl=perlCommand)
            package.dir <- system.file(package='LowMACA')
            script <- file.path(package.dir,'clustalo_lwp.pl')
            mailArgument <- paste("--email" , mail)
            webOut <- file.path(tempdir() , "webClustal")
            webOutArgument <- paste("--outfile" , shQuote(webOut))
            exec <- paste(shQuote(perl)
                            , shQuote(script)
                            , "--noguidetreeout --stype protein --dismatout --outfmt clustal"
                            , mailArgument
                            , webOutArgument
                            , shQuote(fastafile)
                    )
            cmdCheck <- system(exec , intern=TRUE)
            failed <- !is.null(attr(cmdCheck, 'status')) && attr(cmdCheck, 'status') != 0
            if(failed)
                stop(paste("Alignment with ClustalOmega had non 0 exit status:",cmdCheck , sep="\n"))
            dist_mat <- paste0(webOut , ".pim.pim")
            clustalout_clu <- paste0(webOut , ".aln-clustal.clustal")
            score <- .scoreMatrix(dist_mat , mail=mail)
            aln <- .clustalMatrix(clustalout_clu)
            see_aln <- readAAMultipleAlignment(filepath = clustalout_clu
                , format="clustal")
            if( is.null(filename) ) {
                unlink(clustalout_clu)
            } else {
                file.rename(clustalout_clu, filename)
            }
            unlink(dist_mat)
            unlink(fastafile)
        }
    } else {
        if(nrow(genesData)==1) {
                # Fasta length
            len <- nchar(as.character(genesData[1, 'AMINO_SEQ']))
            aln <- data.frame(Gene_Symbol=rep(genesData[, 'Gene_Symbol'], len)
                           , Protein=rep(genesData[, 'UNIPROT'])
                           , Entrez=rep(genesData[, 'Entrez'])
                           , Align=1:len
                           , Ref=1:len
                        )
            see_aln <- AAMultipleAlignment(genesData[1, 'AMINO_SEQ'])
            score <- NA
                # Fasta length
            aln <- data.frame(Gene_Symbol=rep(genesData[ , 'Gene_Symbol'] , len)
                , domainID=rep(genesData[ , 'Pfam_ID'])
                , Entrez=rep(genesData[ , 'Entrez'])
                , Envelope_Start=rep(genesData[ , 'Envelope_Start'])
                , Envelope_End=rep(genesData[ , 'Envelope_End'] , len)
                , Align=1:len
                , Ref=1:len
                )
            score <- NA
        } else {
            if(is.null(mail)) {
            # With a 2 sequence alignment we do not create a distance matrix
                ClustalOmega <- Sys.which(clustal_cmd)
                if(ClustalOmega=="")
                    stop("Clustal Omega command not found. clustalo is not in your PATH or it was not installed")
                warning("There are less than 3 sequences aligned, so no distance matrix can be calculated" 
                    , immediate.=TRUE)
                exec <- paste(ClustalOmega, '--outfmt=clu' , '-i', fastafile, '>', clustalout_clu)
                #Windows doesn't accept the redirection with system >, so we must use shell
                if( Sys.info()['sysname']=="Windows" ) {
                    exec <- gsub("\\\\" , "/" , exec)
                    cmdCheck <- shell(exec , intern=TRUE)
                } else {
                    cmdCheck <- system(exec , intern=TRUE)
                }
                failed <- !is.null(attr(cmdCheck, 'status')) && attr(cmdCheck, 'status') != 0
                if(failed)
                    stop(paste("Alignment with ClustalOmega had non 0 exit status:",cmdCheck , sep="\n"))                
                score <- "It is not possible to calculate distance matrix with less than 3 sequences"
                aln <- .clustalMatrix(clustalout_clu)
                see_aln <- readAAMultipleAlignment(filepath = clustalout_clu
                    , format="clustal")
                if( is.null(filename) ) {
                    unlink(clustalout_clu)
                } else {
                    file.rename(clustalout_clu, filename)
                }
                unlink(fastafile)
            } else {
                perl <- .findPerl(perl=perlCommand)
                package.dir <- system.file(package='LowMACA')
                script <- file.path(package.dir,'clustalo_lwp.pl')
                mailArgument <- paste("--email" , mail)
                webOut <- file.path(tempdir() , "webClustal")
                webOutArgument <- paste("--outfile" , shQuote(webOut))
                exec <- paste(shQuote(perl)
                            , shQuote(script)
                            , "--noguidetreeout --stype protein --nodismatout --outfmt clustal"
                            , mailArgument
                            , webOutArgument
                            , shQuote(fastafile)
                    )
                cmdCheck <- system(exec , intern=TRUE)
                failed <- !is.null(attr(cmdCheck, 'status')) && attr(cmdCheck, 'status') != 0
                if(failed)
                    stop(paste("Alignment with ClustalOmega had non 0 exit status:",cmdCheck , sep="\n"))
                clustalout_clu <- paste0(webOut , ".aln-clustal.clustal")
                score <- "It is not possible to calculate distance matrix with less than 3 sequences"
                aln <- .clustalMatrix(clustalout_clu)
                see_aln <- readAAMultipleAlignment(filepath = clustalout_clu
                    , format="clustal")
                if( is.null(filename) ) {
                    unlink(clustalout_clu)
                } else {
                    file.rename(clustalout_clu, filename)
                }
                unlink(fastafile)
            }
        }
    }
    return(list(ALIGNMENT=aln, SCORE=score, CLUSTAL=see_aln))
}

.filterMAlign <- function(alignment, genes, datum) {
    ## ALIGNMENT element
    alignment$ALIGNMENT <- droplevels(alignment$ALIGNMENT[alignment$ALIGNMENT$Gene_Symbol %in% genes, ])
    ## SCORE element
    if( !datum ) {
        ix <- sort(unlist(lapply(genes, grep, x=rownames(alignment$SCORE$DIST_MAT))))
        if( length(ix)>2 ) {
            alignment$SCORE$DIST_MAT <- alignment$SCORE$DIST_MAT[ix,ix,drop=FALSE]
            alignment$SCORE$SUMMARY_SCORE <- cbind(
                MEAN_SIMILARITY=apply(alignment$SCORE$DIST_MAT, 1, mean, na.rm=TRUE)
                , MEDIAN_SIMILARITY=apply(alignment$SCORE$DIST_MAT, 1, median, na.rm=TRUE)
                , MAX_SIMILARITY=apply(alignment$SCORE$DIST_MAT, 1, max, na.rm=TRUE)
                , MIN_SIMILARITY=apply(alignment$SCORE$DIST_MAT, 1, min, na.rm=TRUE)
                )        
        } else {
            alignment$SCORE <- "It is not possible to calculate distance matrix with less than 3 sequences"
        }        
    }
    ## CLUSTAL element
    ix <- sort(unlist(lapply(genes, grep, x=alignment$CLUSTAL@unmasked@ranges@NAMES)))
    alignment$CLUSTAL <- 
        as(apply(as.matrix(alignment$CLUSTAL)[ix,,drop=FALSE],1,paste,collapse=''), "AAMultipleAlignment")
    ## return the filtered alignment
    return(alignment)

}

.clustalMatrix <- function(filename){
    origMAlign <- readAAMultipleAlignment(filepath = filename , format="clustal")
    origMAlign_mat <- Biostrings::as.matrix(origMAlign)
    out <- apply(origMAlign_mat, 1 , function(seq) {
        match <- rep(NA, length(seq))
        ix <- which(seq != '-')
        match[ix] <- 1:length(ix)
        return(match)
        })
    colnames(out) <- rownames(origMAlign_mat)
    out <- as.data.frame(out)
    out$Align <- 1:nrow(out)
    out <- melt(out, id.vars='Align')
    colnames(out)[2:3] <- c('Seq','Ref')
    domainID <- out$Seq
    lines_name <- do.call('rbind', 
        sapply(as.character(out$Seq), strsplit, '\\|'))
    rownames(lines_name) <- NULL
    envelope <- do.call('rbind', sapply(lines_name[,4], strsplit, ';'))
    envelope <- apply(envelope, 2, as.numeric)
    lines_name <- data.frame(domainID, lines_name[,1:3], envelope)
    colnames(lines_name) <- c('domainID', 'Gene_Symbol', 'Pfam', 'Entrez', 
        'Envelope_Start', 'Envelope_End')
    out <- cbind(lines_name, out[, c('Align', 'Ref')])
    out$Ref <- out$Ref + out$Envelope_Start - 1
    return(out) 
}

    #Calculate similarity matrix and some summary statistics
.scoreMatrix <- function(filename , mail){
    dist_seq <- read.table(filename , row.names=1 , skip=1)
    if(!is.null(mail)) {
        rownames(dist_seq) <- dist_seq$V2
        dist_seq$V2 <- NULL
    }
    colnames(dist_seq) <- rownames(dist_seq)
    dist_seq <- as.matrix(dist_seq)
    diag(dist_seq) <- NA
    mean_dist <- rowMeans(dist_seq , na.rm=TRUE)
    median_dist <- apply(dist_seq , 1 , function(x) median(x , na.rm=TRUE) )
    max_dist <- apply(dist_seq , 1 , function(x) max(x , na.rm=TRUE) )
    min_dist <- apply(dist_seq , 1 , function(x) min(x , na.rm=TRUE) )
    summary <- data.frame(MEAN_SIMILARITY=mean_dist
                            , MEDIAN_SIMILARITY=median_dist
                            , MAX_SIMILARITY=max_dist
                            , MIN_SIMILARITY=min_dist
                            )
    rownames(summary) <- rownames(dist_seq)
    if( any(summary$MAX_SIMILARITY <= 20)) {
        warning("There are sequences very dissimilar to the others! 
            Consider to exclude them because the maximum similarity with any other sequence is less than 20%" , immediate.=TRUE)
        message(summary[ summary$MAX_SIMILARITY <= 20 , ])
    }
    return(list(DIST_MAT=dist_seq , SUMMARY_SCORE=summary))
}

.getGeneMutations <- function(myGenes=myGenes
                            ,parallelize=FALSE  
                            ,mutation_type=c("missense", "all", "truncating" , "silent") 
                            ,NoSilent=TRUE 
                            ,Exonic=TRUE
                            ,tumor_type="all_tumors"
                            )
{
    # mycgds <- cgdsr::CGDS("http://www.cbioportal.org/")
    # all_cancer_studies <- cgdsr::getCancerStudies(mycgds)[,c(1,2)]
    mycgds <- cBioPortalData::cBioPortal(
      hostname = "www.cbioportal.org",
      protocol = "https",
      api. = "/api/api-docs")
    all_cancer_studies <- cBioPortalData::getStudies(mycgds)[ , c("cancerTypeId" , "studyId")]
    all_cancer_studies <- unique(all_cancer_studies)
    # all_cancer_studies <- cbind(all_cancer_studies$cancerTypeId , all_cancer_studies)
    # all_cancer_studies2 <- unique(
    #   data.frame(
    #     # Code=sapply(all_cancer_studies$cancer_study_id
    #     #     , function(x) strsplit(x , "_")[[1]][1])
    #     Code=all_cancer_studies$cancerTypeId
    #     , Full_Name=sapply(all_cancer_studies$name
    #                        , function(x) .myTrimmer(strsplit(x , "\\(")[[1]][1]))
    #   ))
    # all_cancer_studies3 <- aggregate(Full_Name~Code, all_cancer_studies2
    #                                  , FUN=function(x) {paste(x , collapse="|")})
    mutation_type <- mutation_type[1]
        # If I want just silent mutation, this overwrite NoSilent mode
    if(mutation_type=="silent") NoSilent=FALSE
    if(tumor_type[1]=="all_tumors") {
        chosenTumors <- all_cancer_studies[,1,drop=TRUE]
    } else {
        chosenTumors <- unlist(all_cancer_studies[
          grepl( paste(tumor_type , collapse="|") 
                , all_cancer_studies[,1,drop=TRUE] , ignore.case=TRUE) , 1])
        names(chosenTumors) <- all_cancer_studies[
          grepl( paste(tumor_type , collapse="|") 
            , all_cancer_studies[,1,drop=TRUE] , ignore.case=TRUE),2,drop=TRUE]
    }
    ### We don't take data from the most recent pancan atlas
    # pancan <- grep("_pan_can_atlas_2018" , all_cancer_studies[,1,drop=TRUE] , value=TRUE)
    # chosenTumors <- setdiff(chosenTumors , pancan)
    if(parallelize) {
      applyfun <- mclapply
      options('mc.cores'=detectCores())
    } else {
      applyfun <- lapply
    }
    out_double <- applyfun(names(chosenTumors) , function(i) {
      #for each Cancer Study, fetch the type of alteration (genetic profile) 
      #to be considered
      geneticProfile <- cBioPortalData::molecularProfiles(mycgds, i)
      geneticProfile <- geneticProfile[ grep("MUTATION" , geneticProfile$molecularAlterationType) , ]
      if(nrow(geneticProfile)==0){
        message(paste("geneticProfile" , i , "has no mutation data"))
        return( list( out=NULL , patients=NULL) )
      }
      # Fetch the patient list for the specified molecular profile and data type (mutations)
      caseList <- cBioPortalData::sampleLists(mycgds, i)
      caseList <- caseList[ caseList$category == "all_cases_with_mutation_data" , ]
      if(nrow(caseList)==0){
        message(paste("ProfileId" , i , "has no samples with mutations"))
        return( list( out=NULL , patients=NULL) )
      }
      # Get the actual case names (e.g. TCGA-AR-A1AR)
      caseListId <- cBioPortalData::getSampleInfo(api = mycgds 
                                                  , studyId = geneticProfile$studyId 
                                                  , sampleListIds = caseList$sampleListId)$sampleId
      tryCatch(
        muts <- cBioPortalData::mutationData(
          api = mycgds
          , molecularProfileIds = geneticProfile$molecularProfileId
          , entrezGeneIds = as.numeric(as.character(myGenes[ , 'Entrez']))
          , sampleIds = caseListId
        )[[1]]
        , error=function(e) message(paste("Impossible to retrive mutations from" , i , "study or no mutations are present on the selected genes"))
      )
      if(!exists("muts")) {
        # print("mut is the problem, line 619")
        muts <- NULL
        patients <- NULL
      } else {
        muts <- muts[ , c("patientId"
                          ,"entrezGeneId"
                          ,"studyId"
                          ,"mutationType"
                          ,"proteinChange")]
        patients <- muts$patientId
        # if(ncol(muts)!=22) {
        #   # print("mut is the problem, line 624")
        #   muts <- NULL
        #   patients <- NULL
        # } else {
        #   patientsToSplit <- as.character(caseList[sel, 'case_ids'])
        #   if(!is.character(patientsToSplit) || length(patientsToSplit)==0){
        #     # print("mut is the problem, line 630")
        #     muts <- NULL
        #     patients <- NULL
        #   } else {
        #     patients <- strsplit(patientsToSplit , split=" ")[[1]]            
        #   }
        # }
      }
      return( list( out=muts , patients=patients) )
    })
    names(out_double) <- names(chosenTumors)
    if(all(sapply(out_double , function(x) is.null(x$out)))) {
        message("There are no mutations available for the selected tumor types and genes")
        return(list( Mutations=NULL , AbsFreq=NULL ))
    }
        #This is the number of samples
    samples_out <- lapply(1:length(out_double) , function(x) out_double[[x]][["patients"]])
    names(samples_out) <- names(chosenTumors)
    shortNames <- sapply(strsplit(names(samples_out) , "_") , '[' , 1)
        #Create a set of samples per tumor type and not per study type (ex. brca_tcga and brca_pub will be joined)
    chosenTumors_type <- unique(unname(chosenTumors))
    npat_type <- sapply(chosenTumors_type , function(k) length(unique(unlist(samples_out[shortNames%in%k]))))
    # npat_type <- c()
    # for (i in chosenTumors_type) {
    #     selected_tum <- samples_out[grep(i , names(samples_out) , value=TRUE)]
    #     selected_pat <- unique(unlist(selected_tum))
    #     npat_type <- c(npat_type , length(selected_pat) )
    # }
    # names(npat_type) <- chosenTumors_type  
        #Mutation Dataset
    mut <- as.data.frame( 
                data.table::rbindlist(
                    lapply(1:length(out_double) , function(x) out_double[[x]][["out"]]) 
                    ))
    mut$Tumor_Type <- sapply(strsplit(mut$studyId , "_") , '[' , 1)
    myGeneTable <- unique(myGenes[ , c("Gene_Symbol" , "Entrez")])
    myGeneTable$Entrez <- as.numeric(as.character(myGeneTable$Entrez))
    myGeneTable$Gene_Symbol <- as.character(myGeneTable$Gene_Symbol)
    mut$gene_symbol <- plyr::mapvalues(mut$entrezGeneId 
                                       , from = myGeneTable$Entrez
                                       , to = myGeneTable$Gene_Symbol
                                       , warn_missing = FALSE)
    colnames(mut) <- c("case_id" , "entrez_gene_id" , "studyId" , "mutation_type" , "amino_acid_change" , "Tumor_Type" , "gene_symbol")
    # mut$Tumor_Type <- sapply(mut$genetic_profile_id , function(x) strsplit(x , split="_")[[1]][1])
    goodCols <- c("entrez_gene_id"
                ,"gene_symbol"
                ,"case_id"
                ,"mutation_type"
                ,"amino_acid_change"
                ,"Tumor_Type"
                )
    mut <- mut[ , goodCols]
    # Remove all the splice sites outside the coding region (reported as e1-2 or similar notations)
    mut <- mut[ !grepl("^e" , mut$amino_acid_change) , ]
    mut$letter <- substr(mut$amino_acid_change,1,1)
    mut$position <- as.numeric(as.character(stringr::str_extract(
                    string=mut$amino_acid_change,pattern="\\d+")))
    mut <- data.frame(
        Entrez=mut$entrez_gene_id
        , Gene_Symbol=mut$gene_symbol
        , Amino_Acid_Letter=mut$letter
        , Amino_Acid_Position=mut$position
        , Amino_Acid_Change=mut$amino_acid_change
        , Mutation_Type=mut$mutation_type
        , Sample=mut$case_id
        , Tumor_Type=mut$Tumor_Type)
    for (i in colnames(mut)){
        # if(class(mut[,i])=="factor")
        if(is(mut[,i] , "factor")){
            mut[,i] <- as.character(mut[,i])
        }
    }
    #Delete all non-SNVs mutation and all non-TCGA MutationType
    mut <- mut[ !is.na(mut$Mutation_Type) , ]
    bad_mut_types <- c("Fusion" , "COMPLEX_INDEL" , "vIII deletion" , "Splice_Site_SNP" , "Indel")
    mut <- mut[ !(mut$Mutation_Type %in% bad_mut_types) , ]
    mut <- mut[ !(mut$Amino_Acid_Change=="MUTATED") , ]
    if(NoSilent) {
        mut <- mut[ mut$Mutation_Type!="Silent" , ]
    }
    if(Exonic) {
        notTransc <- c("3'UTR"
                    ,"3'Flank"
                    ,"5'UTR"
                    ,"5'Flank"
                    ,"IGR1"
                    ,"IGR"
                    ,"Intron"
                    ,"RNA"
                    ,"Targeted_Region"
                    )
        mut <- mut[ !(mut$Mutation_Type %in% notTransc) , ] 
    }
    if( mutation_type=="missense" ) {
        missense <- c("Missense_Mutation"
                    ,"In_Frame_Del"
                    ,"In_Frame_Ins"
                    )
        mut <- mut[ mut$Mutation_Type %in% missense , ]
    }
    if( mutation_type=="silent" ) {
        mut <- mut[ mut$Mutation_Type=="Silent" , ]
    }
    if( mutation_type=="truncating" ) {
        truncating <- c("Frame_Shift_Del"
                        ,"Nonsense_Mutation"
                        ,"Translation_Start_Site"
                        ,"Frame_Shift_Ins"
                        ,"Nonstop_Mutation"
                        ,"Splice_Site"
                        ,"Indel"
                        )
        mut <- mut[ mut$Mutation_Type %in% truncating , ]
    }
    mut <- unique(mut)
        #Create frequency table by gene and tumor type
    tum_type <- factor(mut$Tumor_Type , levels=chosenTumors_type)
    gene <- tryCatch( factor(mut$Gene_Symbol , levels=levels(myGenes[ , "Gene_Symbol"])) 
                        , error=function(e) factor(mut$Gene_Symbol , levels=unique(mut$Gene_Symbol))
                        , warning=function(w) factor(mut$Gene_Symbol , levels=unique(mut$Gene_Symbol))
                        )
    sample_table <- as.data.frame.matrix(table(tum_type , gene ))
    sample_table2 <- data.frame(StudyID = chosenTumors_type
                                    ,Total_Sequenced_Samples = unname(npat_type)
                                    )
    sample_table_absFreq <- merge( sample_table2 , sample_table , by.x="StudyID" , by.y="row.names")
    return(list( Mutations=mut , AbsFreq=sample_table_absFreq ))
}

.getLocalGeneMutations <- function(myGenes=myGenes
                            ,localData=NULL  
                            ,mutation_type=c("missense", "all", "truncating" , "silent") 
                            ,NoSilent=TRUE 
                            ,Exonic=TRUE
                            ,tumor_type="all_tumors"
                            )
{
    if( is.null(localData) ) stop('no local file provided')
    ## all mutatios from local data
    mut <- localData
    #Delete all non-SNVs mutation and all non-TCGA MutationType
    mut <- mut[ !is.na(mut$Mutation_Type) , ]
    bad_mut_types <- c("Fusion" , "COMPLEX_INDEL" , "vIII deletion" , "Splice_Site_SNP" , "Indel")
    mut <- mut[ !(mut$Mutation_Type %in% bad_mut_types) , ]
    mut <- mut[ !(mut$Amino_Acid_Change=="MUTATED") , ]
    ## filter: genes
    chosenGenes <- myGenes$Gene_Symbol
    mut <- mut[mut$Gene_Symbol %in% chosenGenes, ]
    ## filter: mutation type
    mutation_user_choiche <- mutation_type[1]
    chosenMutations <- unique(mut$Mutation_Type)
    ## check flags
    if(mutation_user_choiche=="silent") NoSilent=FALSE
    if(NoSilent)
        chosenMutations <- chosenMutations[chosenMutations != "Silent"]
    if(Exonic) {
        notTransc <- c("3'UTR", "3'Flank", "5'UTR", "5'Flank"
                    ,"IGR1", "IGR", "Intron", "RNA", "Targeted_Region")
        chosenMutations <- chosenMutations[!chosenMutations %in% notTransc]
    }
    ## check user choiche
    if( mutation_user_choiche=="missense" ) {
        chosenMutations <- chosenMutations[chosenMutations %in% 
            c("Missense_Mutation", "In_Frame_Del", "In_Frame_Ins")]
    } else if ( mutation_user_choiche=="silent" ) {
        chosenMutations <- chosenMutations[chosenMutations == 'Silent']
    } else if( mutation_user_choiche=="truncating" ) {
        truncating <- c("Frame_Shift_Del"
                        ,"Nonsense_Mutation"
                        ,"Translation_Start_Site"
                        ,"Frame_Shift_Ins"
                        ,"Nonstop_Mutation"
                        ,"Splice_Site"
                        ,"Indel"
                        )
        chosenMutations <- chosenMutations[chosenMutations %in% truncating]
    }
    mut <- mut[mut$Mutation_Type %in% chosenMutations, ]
    ## filter: tumor types
    chosenTumors <- unique(mut$Tumor_Type)
    if( tumor_type[1]!="all_tumors" )
        chosenTumors <- chosenTumors[chosenTumors %in% tumor_type]
    mut <- mut[mut$Tumor_Type %in% chosenTumors, ]
    mut <- unique(mut)
    return(list( Mutations=mut , AbsFreq=NA ))
}

.alnWeights <- function(aln)
{
    aln_agg <- aln$ALIGNMENT
    aln_agg$pos_existance <- ifelse(is.na(aln_agg$Ref) , 0 , 1)
    aln_agg2 <- aggregate(pos_existance~Align , data=aln_agg , FUN=sum , simplify=TRUE)
    return(aln_agg2$pos_existance/sum(aln_agg2$pos_existance))
}

.MAD <- function(x) {
    med <- median(x , na.rm=TRUE)
    MAD <- median( abs(x - med) )
    return(1.4826 * MAD)
}

######################
###### calculate entropy
############################

.shannon <- function(q) {
    diff <- diff(q$x)[1]
    p <- q$y[q$y != 0]
    shan <- -sum(p*log(p))*diff
    return(shan)
}

.profileDensity <- function(profile, bw=NULL)
{
    nPos <- length(profile)
    positions <- which(as.logical(profile))
    positions <- rep(positions, times=profile[positions])
    if( is.null(bw) ) {
        d <- density(positions, from=1, to=nPos, n=nPos)
    } else {
        if( bw==0 ) {
            d <- list(x=1:nPos, y=profile, bw=0)
        } else {
            d <- density(positions, bw=bw, from=1, to=nPos, n=nPos)
        }
    }
    # normalize before output
    d$y <- d$y/sum(d$y)
    return(d)
}

.profileEntropy <- function(profile, bw=NULL, norm=TRUE, model=NULL, weights=NULL, ...) 
{
    d <- .profileDensity(profile, bw=bw, ...)
    ent <- .shannon(d)
    if( is.null(bw) ) bw <- d$bw
    if( norm ) {
        if( !is.null(model) ) {
            unif <- model(sum(profile))
        } else {
            len <- length(profile)
            nmut <- sum(profile)
            unif <- .sampleUnifEntropyL(len, nmut, bw=bw, weights=weights)
        }
        mean <- unif[[3]]-unif[[1]]
        var <- unif[[2]]^2
        ## check: if variance is 0, put the
        ## profile pvalue to zero
        if( var==0 ) {
            pval <- 1
        } else {
            shape <- mean^2/var
            scale <- var/mean
            pval <- pgamma(unif[[3]]-ent, shape=shape, scale=scale, lower.tail=FALSE)            
        }
        return(log10(pval))
    } else return(ent)
}

.sampleUnifEntropyL.old <- function(len, nmut, bw, nboot=1000, weights=NULL, 
    center=median, variability=.MAD)
{
    if(is.null(weights)) weights <- rep(1/len , len)
    boots <- sapply(1:nboot, function(i) {
        d <- density(sample(1:len, nmut, replace=TRUE , prob=weights), bw=bw, from=1, to=len, n=len)
        .shannon(d)
        })
    return(list(center=center(boots), variability=variability(boots), max=max(boots)))
}

.sampleUnifEntropyL <- function(len, nmut, bw, nboot=1000, weights=NULL, 
    center=median, variability=.MAD)
{
    if(is.null(weights)) weights <- rep(1/len , len)
    boots <- sapply(1:nboot, function(i) {
        positions <- sample(1:len, nmut, replace=TRUE , prob=weights)
        t <- table(positions)
        profile <- rep(0, len)
        profile[as.numeric(names(t))] <- t
        .profileEntropy(profile, bw=bw, norm=FALSE)
        })
    return(list(center=center(boots), variability=variability(boots), max=max(boots)))
}

.makeUniformModel <- function(mat, bw, nboot=1000, plotOUT=TRUE, 
    weights=NULL, center=median, variability=.MAD, parallelize=FALSE ) 
{
    if( parallelize ) {
        applyfun <- mclapply
    } else applyfun <- lapply
    geneLen <- ncol(mat)
    if( is.null(weights) ) weights <- rep(1/geneLen , geneLen)
    minNMut <- floor(sum(mat)/10)*10 #round to the upper ten
    minNMut <- ifelse(minNMut==0 , 1 , minNMut)
    maxNMut <- ceiling(sum(mat)/10)*10 #round to the upper ten
    maxNMut <- ifelse(maxNMut==0 , 1 , maxNMut)
    nMutInt <- unique(c(minNMut, maxNMut))
    if(length(nMutInt)==1) nMutInt <- c(nMutInt , nMutInt+1)
    outReal <- applyfun(nMutInt, function(i) 
        .sampleUnifEntropyL(geneLen, i, bw=bw, nboot=nboot , weights=weights,
            center=center, variability=variability))
    outReal <- do.call('cbind',outReal)
    polynomialModel <- function(x, par) {
        sapply( x
            , function(x_i)
                sum(sapply(1:length(par), function(i) 
                    if(is.na(par[i])) 0 else x_i^(i-1) * par[i] )))
    }
    pn.optim.aic <- function( tpts , experiment, variance=NULL ) {
        if( length(experiment) < 2 ) return(NA)
        polyOrderChisq <- function(i) {
            model <- lm( experiment~poly( tpts , i , raw=TRUE ) )
            return(list(par=model$coeff, value=AIC(model)))
        }
        sapply(1:min(30,length(tpts)-1), polyOrderChisq)
    }
    pnout <- pn.optim.aic(nMutInt, unlist(outReal[1,]), 1)
    degree <- min(which.min(unlist(pnout[2,])))
    par.mean <- pnout[1,degree]$par
    model.mean <- function(mut) polynomialModel(mut, par.mean)
    pnout <- pn.optim.aic(nMutInt, unlist(outReal[2,]), 1)
    degree <- min(which.min(unlist(pnout[2,])))
    par.sd <- pnout[1,degree]$par
    model.sd <- function(mut) polynomialModel(mut, par.sd)
    pnout <- pn.optim.aic(nMutInt, unlist(outReal[3,]), 1)
    degree <- min(which.min(unlist(pnout[2,])))
    par.max <- pnout[1,degree]$par
    model.max <- function(mut) polynomialModel(mut, par.max)
    modelUnif <- function(nmut) 
        list(mean=model.mean(nmut), sd=model.sd(nmut), max=model.max(nmut))
    if( plotOUT ) {
        par(mfrow=c(1,3))
        plot(nMutInt, unlist(outReal[1,]), xlab='n of mutations', ylab='',
            main='entropy center measure')
        lines(nMutInt, model.mean(nMutInt), col='red', lwd=3)
        plot(nMutInt, unlist(outReal[2,]), xlab='n of mutations', ylab='',
            main='entropy variability measure')
        lines(nMutInt, model.sd(nMutInt), col='red', lwd=3)     
        plot(nMutInt, unlist(outReal[3,]), xlab='n of mutations', ylab='',
            main='max entropy measure')
        lines(nMutInt, model.max(nMutInt), col='red', lwd=3)     
    }
    return(modelUnif)
}

.makeNullProfile <- function(mat, bw, nboot=1000, plotOUT=TRUE, 
    weights=NULL, center=median, variability=.MAD) 
{
    geneLen <- ncol(mat)
    if( is.null(weights) ) weights <- rep(1/geneLen , geneLen)
    nMut <- sum(mat)
    if( bw==0 ) {
        # to prevent having center and variability 
        # that both equals 0
        center <- mean
        variability <- sd
    }
    boots <- lapply(1:nboot, function(i) {
        # density(sample(1:geneLen, nMut, replace=TRUE , prob=weights), bw=bw, from=1, to=geneLen, n=geneLen)
        positions <- sample(1:geneLen, nMut, replace=TRUE , prob=weights)
        t <- table(positions)
        profile <- rep(0, geneLen)
        profile[as.numeric(names(t))] <- t
        .profileDensity(profile, bw=bw)
        })
    nullDensities <- sapply(boots, '[[', 'y')
    # calulate parameters for the gamma distribution
    mu <- apply(nullDensities, 1, center)
    s <- apply(nullDensities, 1, variability)
    s2 <- s^2
    # apply gamma distribution to find thresholds
    upperThreshold <- qgamma(.95, shape=mu^2/s2, scale=s2/mu)
    lowerThreshold <- qgamma(.05, shape=mu^2/s2, scale=s2/mu)

    # pvalue of every aa
    d <- .profileDensity(colSums(mat), bw=bw) #, from=1, to=geneLen, n=geneLen)
    pvals <- pgamma(d$y, shape=mu^2/s2, scale=s2/mu, lower.tail=FALSE)
    

    return(data.frame(
        mean=mu, lTsh=lowerThreshold, uTsh=upperThreshold, profile=d$y, pvalue=pvals#, qvalue=qvals
        ))
}
ste-depo/LowMACA documentation built on Oct. 15, 2022, 11:53 p.m.