###############################################################
# cNMTF
# 1. Preprocessing functions
# 1.4 Functions to construct networks
#
# Corresponding author:
# Luis Leal, Imperial College London, email: lgl15@imperial.ac.uk
###############################################################
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title Constructs the SNV-SNV network
#' @description Function to create a SNV-SNV network using a PPI reference network
#'
# [Input]
# Parameters for the reference network
#' @param net.type Type of reference network. Default "ppi"
#' @param dedges Object with edges from reference network
#'
# Parameters for Linkage Disequilibrium
#' @param remove.highLD Expand SNV consequences to SNVs in high LD
#' @param ld.tao Treshold of LD. Default 0.8
#' @param res.ld Table of LD
#' @param keep.with.LD List of SNPs to include even if they are in LD
#'
# Parameters to counstruct Wu
#' @param R.snps List of SNVs in R
#' @param work.dat Working directory
#' @param trait.project Trait
#' @param n.cores Number of cores for parallel computing
#' @param tmap Mapping of SNPs to genes
#' @param venn.diag Logical. Print Venn diagrams. Default = FALSE.
#' @param plot.file File to print Venn diagrams and node degree distribution
#' @param snps.known SNVs known to be associated with the trait
#'
# [Output]
#' @return
#' * \code{Wu}, \code{G}: adjacency matrix and graph object of the network
#' * Table with node properties to be used in Cytoscape
#' * Venn diagrams of damaging variants and node degree distribution
#'
#' @md
#' @author Luis G. Leal, \email{lgl15@@imperial.ac.uk}
#' @family Preprocessing functions
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
create.network = function( #Parameters for the reference network
net.type = "ppi", #Type of reference network
dedges = NULL, #Object with edges from reference network paste(work.dat,"data/ref_networks/", net.type, "_edges_",trait.project,".txt",sep="")
#Parameters for Linkage Disequilibrium
remove.highLD = TRUE,
ld.tao = 0.8, #Treshold of LD
res.ld = NULL, #Table of LD
keep.with.LD = NULL, #List of SNPs to include even if they are in LD
#Parameters to counstruct Wu
R.snps = NULL, #List of SNPs in R
work.dat = NULL, #Working directory
trait.project = NULL, #Trait
n.cores = 4, #Number of cores
tmap = NULL, #Mapping of SNPs to genes
plot.file = NULL, #File to print Venn diagrams and node degree distribution
venn.diag = FALSE,#Print Venn diagrams?
snps.known = NULL ) #SNPs known to be associated with the trait
{
#-------------------------------------------------
# 1. Load/construct reference network ----
#-------------------------------------------------
#Read file with edges
cat("Dimmensions of table of edges (dedges):", "\n"); print(dim(dedges))
#-------------------------------------------------
# 2. Find SNP localization/consequences ----
#-------------------------------------------------
#Extract information of SNP consequences from ENSEMBL
ensembl.snp <- useMart(biomart="ENSEMBL_MART_SNP", dataset = "hsapiens_snp", host = "grch37.ensembl.org" ) #listAttributes(ensembl.snp)
snp.conseq = getBM(attributes=c("refsnp_id", "consequence_type_tv", "consequence_allele_string", "polyphen_prediction", "polyphen_score", "sift_prediction", "sift_prediction"), filters="snp_filter", values=R.snps, mart=ensembl.snp)
#Check quality of data
cat("Quality of the data", "\n")
cat("SNV consequences ENSEMBL:", "\n"); print( table(snp.conseq$consequence_type_tv) )
cat("SNV consequences Polyphen:", "\n"); print( table(snp.conseq$polyphen_prediction) )
cat("SNV consequences Sift:", "\n"); print( table(snp.conseq$sift_prediction) )
#Check how many hits per predictor
cat("Number of SNVs with consequences ENSEMBL:", "\n"); print( length(unique(snp.conseq$refsnp_id[ snp.conseq$polyphen_prediction != "" ])))
cat("Number of SNVs with consequences Polyphen:", "\n"); print( length(unique(snp.conseq$refsnp_id[ snp.conseq$sift_prediction != "" ])) )
cat("Number of SNVs with consequences Sift:", "\n"); print( length(unique(snp.conseq$refsnp_id[ snp.conseq$consequence_type_tv != "" ])) )
#Extract list of SNVs with damaging/deleterious effects
snp.poly = unique(snp.conseq$refsnp_id[ snp.conseq$polyphen_prediction %in% c("possibly damaging", "probably damaging") ])
snp.sift = unique(snp.conseq$refsnp_id[ snp.conseq$sift_prediction %in% c( "deleterious", "deleterious - low confidence" ) ])
#Extract list of SNPs with HIGH impact according to ENSEMBL
snp.impact = unique(snp.conseq$refsnp_id[ snp.conseq$consequence_type_tv %in% c("transcript_ablation", "splice_acceptor_variant", "splice_donor_variant", "stop_gained", "frameshift_variant", "start_lost", "transcript_amplification", "inframe_insertion", "inframe_deletion", "missense_variant", "protein_altering_variant") ] ) #Source: https://www.ensembl.org/info/genome/variation/predicted_data.html ])
#-------------------------------------------------
# 3. Share consequences to other SNPs in high LD ----
#-------------------------------------------------
if( remove.highLD == TRUE & length(res.ld) > 1){
#Expand consequences to other SNPs in high LD
snp.poly = unique( c(snp.poly, as.character(res.ld$loc1) [ res.ld$loc2 %in% snp.poly & res.ld$r2 > ld.tao ], as.character(res.ld$loc2) [ res.ld$loc1 %in% snp.poly & res.ld$r2 > ld.tao ] ) )
snp.sift = unique( c(snp.sift, as.character(res.ld$loc1) [ res.ld$loc2 %in% snp.sift & res.ld$r2 > ld.tao ], as.character(res.ld$loc2) [ res.ld$loc1 %in% snp.sift & res.ld$r2 > ld.tao ] ) )
#Define vector of high LD SNPs to remove
set.out = unique(as.character(res.ld$loc2[ res.ld$r2 > ld.tao] ))
#Include some SNPs specified by the user even if they are SNPs in LD
set.out = setdiff( set.out, keep.with.LD )
#Filter vectors
snp.poly = snp.poly [ !(snp.poly %in% set.out) ]
snp.sift = snp.sift [ !(snp.sift %in% set.out) ]
snp.impact = snp.impact [ !(snp.impact %in% set.out) ]
snps.known = snps.known [ !(snps.known %in% set.out) ]
}else{
set.out = NULL
}
#-------------------------------------------------
# 4. Venn Diagram ----
#-------------------------------------------------
#Merge lists into one
ldamaging = unique(c(snp.poly, snp.sift, snp.impact))
#Check number of damaging variants and known associations
length(ldamaging)
cat( length(snps.known), "SNVs known to be associated with the trait. source: GWAS catalogue.", "\n")
cat( length(snp.impact), "SNVs with high or moderate impact (e.g, frameshift variant, stop gained) source: Ensembl.", "\n")
cat( length(union(snp.poly,snp.sift)), "deleterious SNVs. source: Sift and Polyphen.", "\n")
cat( "Total:", length(union(ldamaging,snps.known)), "damaging SNVs", "\n" )
if( venn.diag == TRUE){
#Create venn diagrams for each source
venn.list = list(Sift.Poly = union( snp.poly, snp.sift), ENSEMBL = snp.impact, GWAS = snps.known)
fill.venn = c('blue', 'orange', 'green', 'yellow', 'brown')[1:(length(venn.list))]
venn.plot = venn.diagram( x = venn.list ,
filename = NULL,
output = TRUE ,
imagetype="png" ,
resolution = 300,
compression = "lzw",
lty = 'blank',
fill = fill.venn,
cex = rep( 2, c(1,3,7,15,31)[length(venn.list)]),
cat.cex = 2, margin = 0.1)
#Plot diagram to file
pdf(plot.file, width = 6, height = 6)
plot.new()
par(mfrow = c(1,1))
grid.draw( venn.plot )
}
#-------------------------------------------------
# 5. Construct the network guided by damaging SNVs ----
#-------------------------------------------------
#Hubs in the network
lhubs = union( ldamaging, snps.known )
#Run function to construct and print the network
res.Wu = construct.Wu(R.snps = R.snps[ !(R.snps %in% set.out) ], work.dat = work.dat,
tmap = tmap, net.type = net.type, dedges = dedges, #Network as a list of edges
trait.project = trait.project, n.cores = n.cores,
alledges.snp = FALSE, lhubs = lhubs)
#Check density and node degree distribution
degree.distribution( res.Wu[[1]], gamma.kd = 2, weighted = TRUE)
dev.off()
#Number of damaging and known SNPs connected in the network
cat("Number of damaging SNPs connected in the network", sum( R.snps[ rowSums( res.Wu[[1]] )>0] %in% ldamaging), "\n", as.character(Sys.time()), "\n")
cat("Number of known SNPs connected in the network", sum( R.snps[ rowSums( res.Wu[[1]] )>0] %in% snps.known), "\n", as.character(Sys.time()), "\n")
#Create with node properties for Cytoscape
snp.cat = rep("Candidate",length(R.snps))
snp.cat[ R.snps %in% ldamaging ] <- "Damaging"
snp.cat[ R.snps %in% snps.known ] <- "Known"
#Add Gene
snp.gene = as.character(tmap$entrezgene[ match(R.snps, tmap$refsnp_id)])
R.snps[which(is.na(snp.gene))] #SNPs without linked gene
#Write table
cat("Writing table with node properties for Cytoscape", "\n");
write.table( cbind(R.snps, snp.cat, snp.gene), file = paste(work.dat, "Gu_",net.type, "_", trait.project, "_attributes.txt",sep=""), row.names = FALSE, quote = FALSE)
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title Creates edge list for the SNV-SNV network
#' @description Function to find pairs of SNVs linked in the network
#'
#[Input]
#' @param i counter variable over the list of genes
#' @param dedges network as a list of edges between gene entrezids
#' @param lgene list of genes mapping the SNPs in R
#' @param R.snps list of SNPs in the relationship matrix
#' @param tmap table for mapping SNPs to genes
#'
#[Output]
#' @return Nested list of SNP positions to be connected by edges
#'
#' @md
#' @author Luis G. Leal, \email{lgl15@@imperial.ac.uk}
#' @family Preprocessing functions
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
rownet.hub = function(i = NULL, dedges = NULL,lgene = NULL, R.snps = NULL, tmap = NULL, lhubs = NULL){
print(cat("Finding SNP-SNP interactions for gene", i, "\n"))
#Declare lists of SNPs of gene i and SNPs of gene i's interactors
pos.a.hub = pos.b.hub = NULL; pos.a.snp = pos.b.snp = NULL
#Interactors of protein i (Rows of protein i in the table dedges)
a = c( which( dedges[,1] %in% lgene[i]), which( dedges[,2] %in% lgene[i] ))
#Find the SNPs of gene i and map them to the list of snps total (i.e. rownames of R)
pos.a.hub = which( R.snps %in% tmap$refsnp_id[ tmap$entrezgene %in% lgene[i] & tmap$refsnp_id %in% lhubs ])
pos.a.snp = which( R.snps %in% tmap$refsnp_id[ tmap$entrezgene %in% lgene[i]])
for(j in i:length(lgene)){ #Including SNP-SNP interactions within the same gene
##Interactors of protein j (Rows of protein j)
b = c( which( dedges[,1] %in% lgene[j] ), which( dedges[,2] %in% lgene[j] ) )
if( sum(a%in%b)>0 ){ # If they share any interaction
##Find the SNPs of gene j and map them to the list of snps total (i.e. rownames of R)
pos.b.hub = c(pos.b.hub, which( R.snps %in% tmap$refsnp_id[ tmap$entrezgene %in% lgene[j] & tmap$refsnp_id %in% lhubs ]))
if(i == j){
pos.b.snp = which( R.snps %in% tmap$refsnp_id[ tmap$entrezgene %in% lgene[j] ])
}
}
}
return(list(pos.a.snp, pos.b.snp, pos.a.hub, pos.b.hub))
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title Constructs the SNV-SNV adjacency matrix
#' @description Function to construct the adjacency matrix Wu for the SNV-SNV network
#'
#[Input]
#' @param R.snps list of SNPs
#' @param trait.project project's name to be included in the file name
#' @param tmap mapping of SNPs to gene entrezids
#' @param ncores number of cores for parallel computing
#' @param lhubs list of damaging variants
#'
#[Output]
#' @return \code{Wu}, \code{G} adjacency matrix and graph object of the network
#'
#' @md
#' @author Luis G. Leal, \email{lgl15@@imperial.ac.uk}
#' @family Preprocessing functions
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
construct.Wu = function(R.snps = NULL, work.dat = NULL, trait.project = NULL,
dedges = NULL, n.cores = NULL, net.type,
tmap = NULL, lhubs = NULL,
alledges.snp = FALSE #Include edges between proteins for all SNPs or only for hubs
)
{
cat("Filtering list of genes to match those mapped in R", "\n")
#Filter tmap to include only snps in R
tmap = tmap[tmap$refsnp_id %in% R.snps, ]
#Define list of genes shared by the SNPs
tmap = data.frame(tmap)
lgene = unique(tmap$entrezgene)
lgene = lgene[!is.na(lgene)]
#Filter list of genes to match those mapped in R
lgene.s = lgene[ lgene %in% tmap$entrezgene[ tmap$refsnp_id %in% R.snps] ]
cat("Number of genes mapping R:", length(lgene.s), "\n", as.character(Sys.time()), "\n")
#Register the number of cores for parallel computing
if( is.null(n.cores) ){
doMC::registerDoMC(cores = (detectCores() - 2 ))
}else{
doMC::registerDoMC(cores = n.cores)
}
cat("Finding edges in the PPI network for each gene", "\n", as.character(Sys.time()), "\n")
#Create the adjacency matrix
Wu = matrix(0, nrow = length(R.snps), ncol = length(R.snps))
#Find edges
#Find the edges at each row
cat("Creating SNP-SNP network with hubs", "\n")
rows.Wu <- foreach(i = 1:(length(lgene.s))) %dopar% {
rows.Wu = rownet.hub(i = i, dedges = dedges, lgene = lgene.s, R.snps = R.snps, tmap = tmap, lhubs = lhubs)}
cat("Filling the adjacency matrix", "\n", as.character(Sys.time()), "\n")
for(i in 1:length(rows.Wu)){
#Edges between snps in the same protein
if( length( rows.Wu[[i]][[1]] ) > 0 & length( rows.Wu[[i]][[2]] ) > 0 ){
Wu[ rows.Wu[[i]][[2]] , rows.Wu[[i]][[1]] ] <- 1 / (length ( rows.Wu[[i]][[1]] ) - 1)
Wu[ rows.Wu[[i]][[1]] , rows.Wu[[i]][[2]] ] <- 1 / (length ( rows.Wu[[i]][[1]] ) - 1)
}
#Edges between snps and hubs in the same protein
if( length( rows.Wu[[i]][[1]] ) > 0 & length( rows.Wu[[i]][[3]] ) > 0 ){
Wu[ rows.Wu[[i]][[1]] , rows.Wu[[i]][[3]] ] <- 2 / (length ( rows.Wu[[i]][[1]] ) - 1)
Wu[ rows.Wu[[i]][[3]] , rows.Wu[[i]][[1]] ] <- 2 / (length ( rows.Wu[[i]][[1]] ) - 1)
}
#Edges between hubs (either in same protein or between proteins)
if( length( rows.Wu[[i]][[3]] ) > 0 & length( rows.Wu[[i]][[4]] ) > 0 ){
Wu[ rows.Wu[[i]][[3]] , rows.Wu[[i]][[4]] ] <- 2
Wu[ rows.Wu[[i]][[4]] , rows.Wu[[i]][[3]] ] <- 2
}
} #End loop across pairs of genes
#Transform the matrix to an adjacency matrix
cat("Changing adjacency matrix class", "\n", as.character(Sys.time()), "\n")
Wu = as.matrix(Wu, matrix.type = c("adjacency"))
diag(Wu) <- 0
#Add rownames and column names
rownames(Wu) <- colnames(Wu) <- R.snps
#Print results
Gu = graph.adjacency(Wu, mode="undirected", weighted = TRUE)
#Write table with edges and weights
cat("Printing list of edges to", paste(work.dat, "/Gu_",net.type, "_", trait.project, ".txt",sep=""), "\n", as.character(Sys.time()), "\n")
write.graph(Gu,file = paste(work.dat, "/Gu_", net.type, "_", trait.project, ".txt",sep=""), format = "ncol")
#Save graph object
save(list = c("Wu","Gu"),file = paste(work.dat, "/Gu_", net.type, "_", trait.project,".RData",sep=""))
return(list(Wu,Gu))
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title Plot node degree of a network
#' @description Function to plot node degree distribution and calculate some graph variables
#'
# [ Input ]
#' @param Gu graph object
#' @param gamma.kd gamma parameter for a law power distribution of node degree
#' @param weighted are the edges weighted?
#'
# [ Output ]
#' @return Print graph variables and plot node degree distribution
#'
#' @md
#' @author Luis G. Leal, \email{lgl15@@imperial.ac.uk}
#' @family Preprocessing functions
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
degree.distribution = function(Gu, gamma.kd, weighted = FALSE){
#Calculate network variables
if( weighted == TRUE ){
rsum = rowSums(Gu)
nE = sum(Gu != 0)
nV = sum( rsum > 0 )
cat("Total edges", nE, "\n")
cat("Total nodes", nV, "\n")
cat("Average node degree", sum(rsum)/nV, "\n")
hist(rsum, breaks = 50, xlab = "Weighted node degree", las = 1, main = "", col = "light blue")
}else{
nE = length(E(Gu))
nV = length(V(Gu))
cat("Total edges", nE, "\n")
cat("Total nodes", nV, "\n")
cat("Average node degree", nE/nV, "\n")
#Calculate node degree distribution and its frequencies
kd.snps = degree(Gu)
kd = as.numeric(names(table(kd.snps)))
p.kd = as.numeric(table(kd.snps))
tkd = cbind(kd,p.kd)
colnames(tkd) <- c("kd", "frequency")
print(head(tkd))
print(tail(tkd))
#Plot degree distribution
plot(kd, p.kd , xlab = "Node degree (k)", ylab = "Frequency", las = 1, pch = 16, col = "darkgrey")
pred.p.kd = (kd ^ (-gamma.kd)) * nE
lines(kd, pred.p.kd, lty = gamma.kd, col = "blue")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.