Nothing
# Differetial Expression Analsyis to do after Model-based imputaion
#
# Ref: "A statistical framework for protein quantitation in bottom-up MS-based
# proteomics. Karpievitch Y, Stanley J, Taverner T, Huang J, Adkins JN,
# Ansong C, Heffron F, Metz TO, Qian WJ, Yoon H, Smith RD, Dabney AR.
# Bioinformatics 2009
#
# Written by Yuliya Karpievitch
#' Model-Based differential expression analysis
#'
#' Model-Based differential expression analysis is performed on peptide
#' level as desribed in
#' Karpievitch et al. 2009 "A statistical framework for protein quantitation in
#' bottom-up MS-based proteomics" Bioinformatics.
#' @param mm m x n matrix of intensities, num peptides x num samples
#' @param treatment vector indicating the treatment group
#' of each sample ie [1 1 1 1 2 2 2 2...]
#' @param prot.info 2+ colum data frame of peptide ID, protein ID, etc. columns
#' @param pr_ppos - column index for protein ID in prot.info.
#' Can restrict to be #2...
#'
#' @return A data frame with the following columns:
#' \describe{
#' \item{ProtID}{protein identification information taken from prot.info,
#' 1 column used to identify proteins}
#' \item{FC}{fold change}
#' \item{p-value}{p-value for the comparison between 2 groups
#' (2 groups only here)}
#' \item{BH-adjusted p-value}{Benjamini-Hochberg adjusted p-values}
#'}
#' @examples
#' data(mm_peptides)
#' head(mm_peptides)
#' # different from parameter names as R uses outer
#' # name spaces if variable is undefined
#' intsCols = 8:13
#' metaCols = 1:7 # reusing this variable
#' m_logInts = make_intencities(mm_peptides, intsCols)
#' m_prot.info = make_meta(mm_peptides, metaCols)
#' m_logInts = convert_log2(m_logInts)
#' grps = as.factor(c('CG','CG','CG', 'mCG','mCG','mCG'))
#'
#' set.seed(135) # results rarely vary due to the random seed for EigenMS
#' mm_m_ints_eig1 = eig_norm1(m=m_logInts,treatment=grps,prot.info=m_prot.info)
#' mm_m_ints_eig1$h.c # check the number of bias trends detected
#' mm_m_ints_norm = eig_norm2(rv=mm_m_ints_eig1)
#' mm_prot.info = mm_m_ints_norm$normalized[,1:7]
#' mm_norm_m = mm_m_ints_norm$normalized[,8:13]
#'
#' set.seed(131) # important to reproduce the results later
#' imp_mm = MBimpute(mm_norm_m, grps, prot.info=mm_prot.info,
#' pr_ppos=2, my.pi=0.05,
#' compute_pi=FALSE)
#' DE_res = peptideLevel_DE(imp_mm$y_imputed,
#' grps, mm_m_ints_norm$normalized[,metaCols],
#' pr_ppos=2)
#' @export
peptideLevel_DE = function(mm, treatment, prot.info, pr_ppos=2)
{
# Match to protein
all.proteins = unique(prot.info[,pr_ppos])
numProts = length(all.proteins) # 1569
y_out = matrix(NA, numProts, 5)
de_ret = NULL
u_prot_info = NULL
ll = length(all.proteins)
for (kk in seq_len(ll)) {
prot = all.proteins[kk]
pmid.matches = prot.info[prot.info[,pr_ppos]==prot,1]
curr_prot.info = prot.info[prot.info[,pr_ppos]==prot,]
idx.prot = which(prot.info[,1] %in% pmid.matches)
# need to return unique prot.info, make it as we go
ttt = prot.info[idx.prot,]
if(!is.null(dim(ttt))) {
u_prot_info = rbind(u_prot_info, ttt[1,])
} else {
u_prot_info = rbind(u_prot_info, ttt)
}
y_raw = mm[idx.prot,,drop=FALSE]
n.peptide = nrow(y_raw)
yy = as.vector(t(y_raw))
peptide = as.factor(rep(seq_len(n.peptide),
each=dim(data.frame(treatment))[1])) # used below
# keep track of prIDs here
curr_prot.info = curr_prot.info[kk,] # possibly a subset
# good to know how many peptides were in a given protein
y_out[kk,5] = n.peptide
if (n.peptide != 1){
# replicate treatment for # peptides
treatment_hold = treatment
treatment = rep(treatment, times=n.peptide)
res = stats::lm(yy ~ treatment + peptide,
contrasts = list(peptide="contr.sum"))
res1 = summary(res) # above line only good for 2 groups?
# col 3 reserved for BH p-values
y_out[kk,c(1,2,4)] = stats::coefficients(res1)[2,c(1,4,3)]
treatment = treatment_hold
} else {
res = stats::lm(yy~treatment) # only good for 2 groups?
res1 = summary(res)
# col 3 reserved for BH p-values
y_out[kk,c(1,2,4)] = stats::coefficients(res1)[2,c(1,4,3)]
} # coefs, p-value, t-value
} # end for each protein
colnames(y_out) = c('FC', 'P_val', 'BH_P_val', 'statistic', 'num_peptides')
# BH adjustment
y_out[,3] = stats::p.adjust(y_out[,2],"BH")
# add protein names as 1st col in a data frame
DE_res = data.frame(all.proteins, y_out, stringsAsFactors=FALSE)
de_ret$DE_res = DE_res
de_ret$prot.info = u_prot_info
cols1 = colnames(de_ret$DE_res)
cols1[1] = "ProtIDused"
cols2 = colnames(de_ret$prot.info)
de_ret = data.frame(de_ret, stringsAsFactors = FALSE)
colnames(de_ret) = c(cols1, cols2)
return(de_ret)
}
#' Plot peptide trends
#'
#' Plot Raw, Normalized and Normalized & Imputed peptide trends for a protein
#' @param mm matrix of raw intensities
#' @param prot.info metadata for the intensities in mm
#' @param sorted_norm_m normalized intensities, possibly fewer than in mm due
#' to filtering out peptides with fewer than one
#' observation per treatment group
#' @param sorted_prot.info metadata for the intensities in sorted_norm_m
#' @param imp_mm imputed intensities (post normalization)
#' @param imp_prot.info metadata for the imputed intensities in imp_mm
#' @param prot_to_plot protein ID to plot
#' @param prot_to_plot_col protein ID column index
#' @param gene_name gene ID to plot
#' @param gene_name_col gene ID to plot column index
#' @param mylabs sample labels to be plotted on x-axis
#'
#' @examples
#'
#' data("hs_peptides") # loads variable hs_peptides
#' intsCols = 8:13 # column indeces that contain intensities
#' m_logInts = make_intencities(hs_peptides, intsCols)
#' # replace 0's with NA's as NA's are more appropriate
#' # for anlysis and log2 transform
#' m_logInts = convert_log2(m_logInts)
#' # column indices that contain metadata such as protein IDs and sequences
#' metaCols = 1:7
#' m_prot.info = make_meta(hs_peptides, metaCols)
#' grps = as.factor(c('CG','CG','CG', 'mCG','mCG','mCG'))
#'
#' set.seed(135)
#' hs_m_ints_eig1 = eig_norm1(m=m_logInts,treatment=grps,prot.info=m_prot.info)
#' hs_m_ints_eig1$h.c = 2 # looks like there are 2 bias trends at least
#' hs_m_ints_norm = eig_norm2(rv=hs_m_ints_eig1)
#' hs_prot.info = hs_m_ints_norm$normalized[,metaCols]
#' hs_norm_m = hs_m_ints_norm$normalized[,intsCols]
#'
#' set.seed(125)
#' imp_hs = MBimpute(hs_norm_m, grps, prot.info=hs_prot.info,
#' pr_ppos=3, my.pi=0.05, compute_pi=FALSE)
#' mylabs = c( 'CG','CG','CG', 'mCG','mCG','mCG')
#' prot_to_plot = 'Prot32' # 43
#' gene_to_plot = 'Gene32'
#' plot_3_pep_trends_NOfile(as.matrix(hs_m_ints_eig1$m),
#' hs_m_ints_eig1$prot.info,
#' as.matrix(hs_norm_m),
#' hs_prot.info,
#' imp_hs$y_imputed,
#' imp_hs$imp_prot.info,
#' prot_to_plot, 3,
#' gene_to_plot, 4, mylabs)
#'
#' @return Nil
#' @export
plot_3_pep_trends_NOfile = function(mm, prot.info, sorted_norm_m,
sorted_prot.info,
imp_mm, imp_prot.info, prot_to_plot,
prot_to_plot_col,
gene_name, gene_name_col, mylabs) {
graphics::par(mfcol=c(1,3))
ppos = imp_prot.info[,prot_to_plot_col] == prot_to_plot
tmp = imp_mm[ppos,]
if(sum(ppos) == 1) { # only 1 row, duplicate 1 row...
tmp = rbind(tmp,tmp)
}
# take a Raw data, need these for y-limits
# unsorted but OK, peptides will be in different order
ppos2 = prot.info[,prot_to_plot_col] == prot_to_plot
tmp2 = mm[ppos2,]
if(sum(ppos2) == 1) { # only 1 row, duplicate 1 row...
tmp2 = rbind(tmp2,tmp2)
}
# could not normalize as all obs missing in 1+ group...
if(dim(tmp)[1] == 0 ) { tmp = tmp2 }
ylim_min = min(min(tmp, na.rm=TRUE), min(tmp2, na.rm=TRUE))
ylim_max = max(max(tmp, na.rm=TRUE), max(tmp2, na.rm=TRUE))
myylim = c(ylim_min-.2, ylim_max+.2) # give a bit of margin
main_title = paste(gene_name, ' (',
prot_to_plot, ') Normalized & Imputed', sep='')
graphics::par(mfcol=c(1,3))
graphics::par(mar=c(10,3,3,3))
# may need to transpose ylim will be dynamically set here,
# and kept for the other plots
graphics::matplot(t(tmp), type="l", main=main_title, xaxt = "n", ylim=myylim)
graphics::axis(1, at=seq_len(length(mylabs)), labels=mylabs, las=3)
graphics::matpoints(t(tmp), type="p", pch='*') # , ylim=c(15,35))
graphics::lines(colMeans(tmp,na.rm = TRUE), lwd=3, col='black')
limits = graphics::par("usr") # upper and low ylim and upper and lower xlim
myxlim = c(limits[1], limits[2])
ppos = sorted_prot.info[,prot_to_plot_col] == prot_to_plot #
tmp = sorted_norm_m[ppos,]
if(sum(ppos) == 1) { # only 1 row, duplicate 1 row...
tmp = rbind(tmp,tmp)
}
# could not normalize as all obs missing in 1+ group...
if(dim(tmp)[1] == 0 ) { tmp = tmp2 }
main_title = paste(gene_name, ' (', prot_to_plot, ') Normalized', sep='')
graphics::matplot(t(tmp), type="l", pch='19', main=main_title,
ylim=myylim, xlim=myxlim, xaxt = "n") # may need to transpose
graphics::axis(1, at=seq_len(length(mylabs)), labels=mylabs, las=3)
graphics::matpoints(t(tmp), type="p", pch='*')
graphics::lines(colMeans(tmp,na.rm = TRUE), lwd=3, col='black')
main_title = paste(gene_name, ' (', prot_to_plot, ') Raw', sep='')
graphics::matplot(t(tmp2), type="l", main=main_title, ylim=myylim, xaxt="n")
graphics::axis(1, at=seq_len(length(mylabs)), labels=mylabs, las=3)
graphics::matpoints(t(tmp2), type="p", pch='*') # , ylim=c(15,35))
graphics::lines(colMeans(tmp2,na.rm = TRUE), lwd=3, col='black')
graphics::par(mar=c(3,3,3,3))
}
#' Plot trends for a single protien
#'
#' Plot peptide trends for a protein
#' @param mm matrix of raw intensities
#' @param prot.info metadata for the intensities in mm
#' @param prot_to_plot protein ID to plot
#' @param prot_to_plot_col protein ID column index
#' @param gene_name gene ID to plot
#' @param gene_name_col gene ID to plot column index
#' @param colors what colors to plot peptide abundances as,
#' most commonly should be
#' treatment groups
#' @param mylabs sample labels to be plotted on x-axis
#' @return Nil
plot_1prot = function(mm, prot.info, prot_to_plot, prot_to_plot_col,
gene_name, gene_name_col, colors, mylabs) {
graphics::par(mfcol=c(1,1))
graphics::par(mar=c(10,3,3,3))
ppos = prot.info[,prot_to_plot_col] == prot_to_plot
tmp = mm[ppos,]
if(sum(ppos) == 1) {
# only 1 row, duplicate 1 row to be able to sued matplot function
tmp = rbind(tmp,tmp)
}
# may need to transpose ylim will be dynamically set here,
# and kept for the other plots
graphics::matplot(t(tmp), type="l", main=prot_to_plot, xaxt="n", col=colors)
graphics::axis(1, at=seq_len(length(mylabs)), labels=mylabs, las=3)
graphics::matpoints(t(tmp), type="p", pch='*', col=colors)
graphics::lines(colMeans(tmp,na.rm = TRUE), lwd=3, col='black')
graphics::par(mar=c(3,3,3,3))
}
plot_3_peptide_trends = function(mm, prot.info, sorted_norm_m,
sorted_prot.info, imp_mm,
imp_prot.info, prot_to_plot,
gene_name, mylabs) {
graphics::par(mfcol=c(1,3))
ppos = imp_prot.info[,2] == prot_to_plot
tmp = imp_mm[ppos,]
if(sum(ppos) == 1) { # only 1 row, duplicate 1 row...
tmp = rbind(tmp,tmp)
}
# take a Raw data, need these for y-limits
# unsorted but OK, peptides will be in different order
ppos2 = prot.info[,2] == prot_to_plot
tmp2 = mm[ppos2,]
if(sum(ppos2) == 1) { # only 1 row, duplicate 1 row...
tmp2 = rbind(tmp2,tmp2)
}
if(dim(tmp)[1] == 0 ) { tmp = tmp2 }
# could not normalize as all obs missing in 1+ group...
ylim_min = min(min(tmp, na.rm=TRUE), min(tmp2, na.rm=TRUE))
ylim_max = max(max(tmp, na.rm=TRUE), max(tmp2, na.rm=TRUE))
myylim = c(ylim_min-.2, ylim_max+.2) # give a bit of margin
main_title = paste(gene_name, ' (', prot_to_plot,
') Normalized & Imputed', sep='')
outfnames_png = paste(gene_name, '_', prot_to_plot, '_3pepTrends.png',sep='')
grDevices::png(outfnames_png, width = 20, height = 6.4, units='in', res=400)
# R cannot figue out how & when res is specifed... (?)
graphics::par(mfcol=c(1,3))
graphics::matplot(t(tmp), type="l", main=main_title, xaxt="n", ylim=myylim)
# may need to transpose ylim will be dynamically set here,
# and kept for the other plots
graphics::axis(1, at=seq_len(length(mylabs)), labels=mylabs)
graphics::matpoints(t(tmp), type="p", pch='*') # , ylim=c(15,35))
graphics::lines(colMeans(tmp,na.rm = TRUE), lwd=3, col='black')
limits = graphics::par("usr") # upper and low ylim and upper and lower xlim
myxlim = c(limits[1], limits[2])
ppos = sorted_prot.info[,2] == prot_to_plot #
tmp = sorted_norm_m[ppos,]
if(sum(ppos) == 1) { # only 1 row, duplicate 1 row...
tmp = rbind(tmp,tmp)
}
# could not normalize as all obs missing in 1+ group...
if(dim(tmp)[1] == 0 ) { tmp = tmp2 }
main_title = paste(gene_name, ' (', prot_to_plot, ') Normalized', sep='')
graphics::matplot(t(tmp), type="l", pch='19', main=main_title,
ylim=myylim, xlim=myxlim, xaxt = "n") # may need to transpose
graphics::axis(1, at=seq_len(length(mylabs)), labels=mylabs)
graphics::matpoints(t(tmp), type="p", pch='*')
graphics::lines(colMeans(tmp,na.rm = TRUE), lwd=3, col='black')
main_title = paste(gene_name, ' (', prot_to_plot, ') Raw', sep='')
# may need to transpose
graphics::matplot(t(tmp2), type="l", main=main_title, ylim=myylim, xaxt="n")
graphics::axis(1, at=seq_len(length(mylabs)), labels=mylabs)
graphics::matpoints(t(tmp2), type="p", pch='*') # , ylim=c(15,35))
graphics::lines(colMeans(tmp2,na.rm = TRUE), lwd=3, col='black')
grDevices::dev.off()
}
# function takes a vector of string gene IDs possibly separated by a ';'
# and returns a vector of the same lenfth with only the first gene ID
get1sttoken = function(ids)
{
ll = length(ids) # 22780
prots = NULL
for(ii in seq_len(ll)) {
tt_spl = strsplit(ids[ii], ';')
prots = c(prots, c(tt_spl[[1]][1]))
}
return(prots)
}
# function takes a vector of string gene IDs possibly separated by a ';'
# and returns a vector of the same length with the first gene ID
# with a '+' if more than 1 gene ID was present
get1sttokenPlus = function(ids)
{
ll = length(ids)
prots = NULL
for(ii in seq_len(ll)) {
tt_spl = strsplit(ids[ii], ';')
tt = tt_spl[[1]][1]
if(length(tt_spl[[1]]) > 1) {
tt = paste(tt, '+', sep='')
}
prots = c(prots, tt)
}
return(prots)
}
# function takes a vector of string Protein IDs
# and removes any -2 from the end of the protein name
# input is one protein per row
clip_protID_end = function(ids){
ll = length(ids)
prots = vector('character', length = ll)
for(ii in seq_len(ll)) {
ppos = regexpr("-", ids[ii], fixed=TRUE)[1]
if(ppos != -1) {
prots[ii] = substr(ids[ii], 1, (ppos-1) )
} else {
prots[ii] = ids[ii]
}
}
return(prots)
}
## check for the presence of '+' sign in the
## specificed columns but column number
remove_contaminats_symbol = function(dd, colsCheck=c('Reverse',
'Potential.contaminant'),
colsCheckSymbol=c('+', '+') )
{
# get column indeces for the specified column names,
# eg: 'Reverse' and 'Potential.contaminant'
# access 'dd' by the column index to check for '+'
cols1 = colnames(dd)
ll = length(colsCheck)
for(ii in seq_len(ll)) {
ppos = cols1 == colsCheck[ii]
# position of the column we need to check
colToCheck = seq_len(length(cols1))[ppos]
tt1 = dd[,colToCheck] == '+'
message('Removing ', sum(tt1), ' rows via ', colsCheck[ii], ' column')
dd = dd[!tt1,]
}
return(dd)
}
## check for the presence of 'REV_', 'CON_' in the
## specificed columns but column number
remove_contaminats_prefix = function(dd,
colsCheck = c('Leading.razor.protein',
'Leading.razor.protein'),
colsCheckPrefix=c('REV_', 'CON_') )
{
# get column indeces for the specified column names,
# eg: 'Reverse' and 'Potential.contaminant'
# access 'dd' by the column index to check for matches
# to the prefix in the protein names
cols1 = colnames(dd)
ll = length(colsCheck)
for(ii in seq_len(ll)) {
ppos = cols1 == colsCheck[ii]
# position of the column we need to check
colToCheck = seq_len(length(cols1))[ppos]
ppos1 = gdata::startsWith(as.character(dd[,colToCheck]),
colsCheckPrefix[ii])
message('Removing ', sum(ppos1), ' rows via ', colsCheck[ii], ' column')
dd = dd[!ppos1,]
}
return(dd)
}
# in case of GeneID column having multiple gene names separated by ';'
# take the first one and make it the GeneID column,
# store original GeneID column in GeneIDLong
generate_single_geneID = function(dd, colCheck)
{
cols1 = colnames(dd)
ppos = cols1 == colCheck
colToCheck = seq_len(length(cols1))[ppos]
dd$GeneIDLong = dd[,colToCheck]
dd$GeneID = get1sttoken(as.character(dd[,colToCheck]) )
return(dd)
}
# in case of ProtID column having -2 appended to the protein name
# (or any other -#), eg: Q86U42-2
# take the portion prior to -# and make it the ProtID column,
# store original ProtID column in GeneIDLong
# If there is not -2 or other number following the protein name,
# the name will be unchanged
# Protein names with -2 may create problems when matching tot he
# Ensembl IDs - multiple organizms only!
generate_clipped_ProtID = function(dd, colCheck)
{
cols1 = colnames(dd)
ppos = cols1 == colCheck
colToCheck = seq_len(length(cols1))[ppos]
dd$ProtIDLong = dd[,colToCheck]
dd$ProtID = clip_protID_end(as.character(dd[,colToCheck]) )
return(dd)
}
# function to use after we have fitered duplicate
# Protein and Gene IDs based on Ens ID
# on rare occasion there exist 2 EnsIDs for the same
# GeneID but for different ProtID
# or valid protein ID in one row and missing protein ID
# int he 2nd row. Never seen 3 rows...
remove_dup_geneIDs = function(db)
{
ppos_genid = db$GeneID !=''
db_geneID = db[ppos_genid,]
dim(db_geneID)
db_nogeneID = db[!ppos_genid,]
dim(db_nogeneID)
ppos = duplicated(db_geneID$GeneID )
sum(ppos)
dup_geneIDs = db_geneID[ppos,1]
ppos_dup = db_geneID$GeneID %in% dup_geneIDs
length(ppos_dup)
sum(ppos_dup)
db_geneID_nodup = db_geneID[!ppos_dup,]
dim(db_geneID_nodup)
db_geneID_dup = db_geneID[ppos_dup,]
dim(db_geneID_dup)
# eliminate rows with no protID in the dup ones
# and concatenate back the matrices
u_dup_genids = unique(db_geneID_dup$GeneID) # 53
nn = length(u_dup_genids)
undup_db = NULL
for(ii in seq_len(nn)) {
pcur = db_geneID_dup$GeneID == u_dup_genids[ii]
cur_gene = db_geneID_dup[pcur,]
# keep row with ProtID
ppos_protid = cur_gene$ProtID != ''
if(sum(ppos_protid) > 1) {
# keep the bottm row - in our dataset we get
# that as Leading Razor Protein...
undup_db = rbind(undup_db,cur_gene[2,])
# http://www.enzolifesciences.com/ADI-SPP-502/hsp70-a1-mouse-recombinant/
} else {
undup_db = rbind(undup_db,cur_gene[ppos_protid,])
}
}
# combine db back together without the duplicates
# db_geneID_nodup db_nogeneID undup_db
db_recomb = rbind(db_geneID_nodup, db_nogeneID, undup_db)
return(db_recomb)
}
###############################################################################
## functions to help add Ensembl IDs to various organizms
# for each GeneID take all of the unique EnsIDs
# (and ProtIDs?) and make into a list?
# make a data structure:
# HS$GeneID - vector of unique gene IDs, strings
# HS$EnsID - vector of lists of all EnsIDs that match to this gene
# HS$ProtID - vector of lists of all ProtIDs that match to this gene
# HS$numEnsList - vector of values of number of elelments in teh list of EnsIDs
# must have 3 columns: GeneID, ProtID and EnsID obtained from Biomart
make_ID_structure = function(ens_ids) {
ugenes = unique(ens_ids$GeneID)
ll = length(ugenes) # 19086
HS = NULL
HS$GeneID = NULL
HS$EnsID = vector(mode="list", length=ll)
HS$ProtID = vector(mode="list", length=ll)
HS$NumEnsList = vector(mode="numeric", length=ll)
for(ii in seq_len(ll)) {
HS$GeneID[ii] = ugenes[ii]
ppos = ens_ids$GeneID == HS$GeneID[ii]
HS$EnsID[[ii]] = ens_ids$EnsID[ppos]
# we already dropped rows with NO ProtID, so should not have blank ProtIDs
HS$ProtID[[ii]] = NULL
HS$ProtID[[ii]] = ens_ids$ProtID[ppos]
HS$NumEnsList[ii] = length(HS$EnsID[[ii]])
}
return(HS)
}
get_EnsIDs = function(organizm='human') {
lookup = matrix('', nrow = 2, ncol = 3)
lookup[1,] = c('human', 'hsapiens_gene_ensembl', 'hgnc_symbol')
lookup[2,] = c('mouse', 'mmusculus_gene_ensembl', 'mgi_symbol')
# to be continued...
lookup_pos = lookup[,1] == organizm
ensembl = biomaRt::useMart("ensembl", dataset=lookup[lookup_pos,2])
gene_symbol = lookup[lookup_pos,3]
res = biomaRt::getBM(attributes=c(gene_symbol, 'uniprotswissprot',
'ensembl_gene_id'), mart=ensembl)
dim(res) # human: 37593
colnames(res) = c('GeneID', 'ProtID', 'EnsID')
# remove rows with blank ProtID, will not be an option
# on the proteomics dataset (?)
ppos = res$ProtID == ''
sum(ppos)
res = res[!ppos,]
ens_ids = make_ID_structure(res)
return(ens_ids)
}
# dd1 is the datastructure returned from make_ID_structure()
# must contain dd1$EnsID column
# linker - 2 column matrix with EnsIDs that match to EnsIDs
# in dd1 as well as the other organizm EnsIDs
# which will be added as a list to dd1
match_linker_ids = function(dd1, linker, l_col) {
ll1 = length(dd1$EnsID) # 19086
ll1
match1 = vector(mode="list", length=ll1)
for(ii in seq_len(ll1)) {
if(ii%%100 == 0) { message('Processing Gene ', as.character(ii) ) }
ppos = linker[,l_col] %in% dd1$EnsID[[ii]]
match1[[ii]] = unique(linker[ppos,-l_col])
linker = linker[!ppos,]
}
dd1$linkedIDs = match1
return(dd1)
}
# mm_ens_ids_save = mm_ens_ids
# 2 structures "EnsID" "ProtID" "NumEnsList" "GeneID" "linkedIDs"
# dd1 will be matched by EsnID column and dd2 by linkedIDs
match_ids = function(dd1, dd2) {
ll1 = length(dd1$GeneID)
ll2 = length(dd2$GeneID)
# will use NA's instead of default 0's
match1 = rep(NA, each=ll1)
match2 = rep(NA, each=ll2) # vector('numeric', length=ll2)
num_match1 = vector('numeric', length=ll1)
match_index = 1
# add index such that we could track the original position of
# the gene when elements are removed
dd2$index = seq_len(ll2)
message('Processing Gene 1')
for(ii in seq_len(ll1)) {
if(ii%%500 == 0) { message('Processing Gene ', as.character(ii) ) }
ll2 = length(dd2$GeneID) #will shrink as we go through the matching process
for(jj in seq_len(ll2)) {
ppos = dd1$EnsID[[ii]] %in% dd2$linkedIDs[[jj]]
if (sum(ppos) > 0) {
# we got a match
num_match1[ii] = sum(ppos)
match1[ii] = match_index
match_2_index = dd2$index[jj]
match2[match_2_index] = match_index
# remove elements that matched from the structure # 2
dd2$GeneID = dd2$GeneID[-jj]
dd2$EnsID = dd2$EnsID[-jj]
dd2$ProtID = dd2$ProtID[-jj]
dd2$NumEnsList = dd2$NumEnsList[-jj]
dd2$index = dd2$index[-jj]
dd2$linkedIDs = dd2$linkedIDs[-jj]
match_index = match_index + 1
break # out of inner loop hopefully
}
}
}
ret = NULL
ret$match1 = match1
ret$match2 = match2
return(ret)
}
# use hs_with_MM & mm_with_HS
# add matched IDs to the structures for different organisms
# function uses match_ids() to get most of the work done
# dd1 and dd2 must be structures returned from match_linker_ids
add_match_ids = function(dd1, dd2) {
matched_ids = match_ids(dd1, dd2)
dd1$matchedID = matched_ids$match1
dd2$matchedID = matched_ids$match2
ret = NULL
ret$dd1 = dd1
ret$dd2 = dd2
return(ret)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.