Nothing
create_caprion <- function()
{
HOME <- Sys.getenv("HOME")
load(paste(HOME,"Caprion","pilot","caprion.rda",sep="/"))
caprion <- protein_list
library(biomaRt)
library(dplyr)
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
gene_ensembl <- getBM(
attributes = c("hgnc_symbol", "ensembl_gene_id", "chromosome_name", "start_position", "end_position"),
filters = "hgnc_symbol",
values = pQTLdata::caprion$Gene,
mart = ensembl
)
caprion_ensembl <- left_join(caprion, filter(gene_ensembl,chromosome_name %in% c(paste(1:22),"X")),
by = c("Gene" = "hgnc_symbol")) %>%
group_by(Gene,Protein,Accession,Protein.Description,GO.Cellular.Component,GO.Function,GO.Process) %>%
summarize(ensGenes=paste(ensembl_gene_id,collapse=";"),
chrom=unique(chromosome_name),
starts=paste(start_position,collapse=";"),
ends=paste(end_position,collapse=";"),
start=min(start_position),end=max(end_position))
caprion_ensembl[-c(1,4:7)] |> filter(ensGenes=="NA")
## A tibble: 12 × 8
## Groups: Protein, Accession [12]
# Protein Accession ensGenes chrom starts ends start end
# <chr> <chr> <chr> <chr> <chr> <chr> <int> <int>
# 1 AMY1_HUMAN P04745 NA NA NA NA NA NA
# 2 CT027_HUMAN Q9GZN8 NA NA NA NA NA NA
# 3 CO4B_HUMAN P0C0L5 NA NA NA NA NA NA
# 4 CG026_HUMAN Q96N11 NA NA NA NA NA NA
# 5 CD97_HUMAN P48960 NA NA NA NA NA NA
# 6 FA49B_HUMAN Q9NUQ9 NA NA NA NA NA NA
# 7 GLCM_HUMAN P04062 NA NA NA NA NA NA
# 8 HBA_HUMAN P69905 NA NA NA NA NA NA
# 9 H4_HUMAN P62805 NA NA NA NA NA NA
#10 NHRF1_HUMAN O14745 NA NA NA NA NA NA
#11 SYWC_HUMAN P23381 NA NA NA NA NA NA
#12 SYYC_HUMAN P54577 NA NA NA NA NA NA
Sys.setlocale("LC_CTYPE", "en_US.UTF-8")
library(httr)
library(jsonlite)
accession_list <- c("P04745", "Q9GZN8", "P0C0L5", "Q96N11", "P48960",
"Q9NUQ9", "P04062", "P69905", "P62805", "O14745",
"P23381", "P54577")
base_url <- "https://www.uniprot.org/uniprot/"
results <- list()
for (accession in accession_list) {
url <- paste0(base_url, accession, ".json")
response <- GET(url)
print(paste("Accessing:", url))
print(paste("Status Code:", status_code(response)))
if (status_code(response) == 200) {
raw_content <- content(response, "text")
utf8_content <- iconv(raw_content, from = "latin1", to = "UTF-8")
data <- fromJSON(utf8_content)
print(paste("Data for:", accession))
print(data)
if ("inactiveReason" %in% names(data)) {
merged_ids <- data$inactiveReason$mergeDemergeTo
if (length(merged_ids) > 0) {
results[[accession]] <- merged_ids
} else {
results[[accession]] <- "No merged IDs"
}
} else {
results[[accession]] <- "Active, no merged IDs"
}
} else {
results[[accession]] <- paste("Error: Status code", status_code(response))
}
}
print(results)
library(httr)
library(jsonlite)
base_url <- "https://www.uniprot.org/uniprot/"
all_ensembl_gene_ids <- list()
for (accession in accession_list) {
url <- paste0(base_url, accession, ".json")
response <- GET(url)
if (status_code(response) == 200) {
raw_content <- content(response, "text")
utf8_content <- iconv(raw_content, from = "latin1", to = "UTF-8")
data <- fromJSON(utf8_content) # Parse JSON
ensembl_ids <- NULL
if ("genes" %in% names(data)) {
for (gene in data$genes) {
if ("identifier" %in% names(gene)) {
if (grepl("ENSEMBL", gene$identifier)) {
ensembl_ids <- c(ensembl_ids, gene$identifier)
}
}
}
}
if (is.null(ensembl_ids) || length(ensembl_ids) == 0) {
if ("secondaryAccessions" %in% names(data)) {
ensembl_ids <- data$secondaryAccessions
}
}
if (is.null(ensembl_ids) || length(ensembl_ids) == 0) {
all_ensembl_gene_ids[[accession]] <- "Ensembl gene ID not available"
} else {
all_ensembl_gene_ids[[accession]] <- ensembl_ids
}
} else {
all_ensembl_gene_ids[[accession]] <- paste("Error: Status code", status_code(response))
}
}
print(all_ensembl_gene_ids)
all_ensembl_gene_ids$P04745 <- results$P04745
uniprot_list <- list(
P04745 = c("P0DTE7", "P0DUB6", "P0DTE8"),
Q9GZN8 = c("A8K4J0", "D3DVX8", "Q5JX81", "Q9NWX3"),
P0C0L5 = c("A2BHY4", "P01028", "P78445", "Q13160", "Q13906", "Q14033", "Q14835", "Q6U2E9",
"Q6U2G1", "Q6U2I5", "Q6U2L1", "Q6U2L7", "Q6U2L9", "Q6U2M5", "Q6VCV8", "Q96SA7", "Q9NPK5", "Q9UIP5"),
Q96N11 = c("Q9BQ43"),
P48960 = c("A8K7Z4", "B2RBJ9", "O00718", "O76101", "Q8NG72", "Q8TBQ7"),
Q9NUQ9 = c("Q96AZ5", "Q9NW21", "Q9NZE7"),
P04062 = c("A8K796", "B7Z5G2", "B7Z6S1", "J3KQG4", "J3KQK9", "Q16545", "Q4VX22", "Q6I9R6", "Q9UMJ8"),
P69905 = c("P01922", "Q1HDT5", "Q3MIF5", "Q53F97", "Q96KF1", "Q9NYR7", "Q9UCM0"),
P62805 = c("A2VCL0", "P02304", "P02305", "Q6DRA9", "Q6FGB8", "Q6NWP7"),
O14745 = c("B3KY21", "O43552", "Q86WQ5"),
P23381 = c("A6NGN1", "A6NID3", "P78535", "Q502Y0", "Q53XB6", "Q9UDI5", "Q9UDL3"),
P54577 = c("B3KWK4", "D3DPQ4", "O43276", "Q53EN1")
)
get_gene_positions <- function(uniprot_ids) {
results <- getBM(attributes = c("uniprotswissprot", "ensembl_gene_id",
"hgnc_symbol", "chromosome_name",
"start_position", "end_position"),
filters = "uniprotswissprot",
values = uniprot_ids,
mart = ensembl)
return(results)
}
get_gene_positions_ensembl <- function(uniprot_ids) {
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
results <- getBM(
attributes = c("uniprotswissprot", "ensembl_gene_id", "hgnc_symbol", "chromosome_name", "start_position", "end_position"),
filters = "uniprotswissprot",
values = uniprot_ids,
mart = ensembl
)
return(results)
}
all_gene_positions <- list()
for (uniprot_id in accession_list) {
gene_positions <- get_gene_positions_ensembl(uniprot_id)
if (nrow(gene_positions) > 0) {
all_gene_positions[[uniprot_id]] <- gene_positions
} else {
all_gene_positions[[uniprot_id]] <- "No data found"
}
}
all_gene_positions$P04745 <- get_gene_positions(uniprot_list$P04745)
print(all_gene_positions)
do.call("rbind",all_gene_positions)
all_gene_positions[[1]]$ uniprotswissprot <- "P04745"
collapsed_data <- lapply(all_gene_positions, function(df) {
df %>%
mutate(chromosome_name = if_else(grepl("HS", chromosome_name), "6", as.character(chromosome_name))) %>%
group_by(uniprotswissprot) %>%
summarise(
ensembl_gene_id = paste(ensembl_gene_id, collapse = ";"),
hgnc_symbol = paste(hgnc_symbol, collapse = ";"),
chrs = paste(unique(chromosome_name), collapse = ";"),
starts = paste(start_position, collapse = ";"),
ends = paste(end_position, collapse = ";"),
start = min(as.numeric(start_position), na.rm = TRUE),
end = max(as.numeric(end_position), na.rm = TRUE)
)
})
print(collapsed_data)
caprion_modified <- left_join(caprion_ensembl, do.call("rbind", collapsed_data), by = c("Accession" = "uniprotswissprot")) %>%
mutate(ensGenes = na_if(ensGenes, "NA"),
starts = coalesce(starts.x, starts.y),
ends = coalesce(ends.x, ends.y),
ensGenes = coalesce(ensGenes, ensembl_gene_id),
Gene = coalesce(Gene, hgnc_symbol),
Gene.orig = coalesce(Gene, hgnc_symbol),
chr = coalesce(chrom, chrs),
start = coalesce(start.x, start.y),
end = coalesce(end.x, end.y)) %>%
select(-c(chrs, starts.x, ends.x, starts.y, ends.y, start.x, start.y, end.x, end.y, hgnc_symbol, ensembl_gene_id))
caprion <- caprion_modified
lines <- c(46,137,441,452)
caprion[lines,"Gene"]
caprion[lines,"Gene"] <- c("AMY1","C4B","HBA","HIST")
save(caprion,file='caprion.rda',compress='xz')
}
caprion_legacy <- function()
{
# Check the resulting data frame
print(caprion_modified)
# https://www.uniprot.org/
# P04745 -> P0DTE7, P0DTE8, and P0DUB6
manual_check <- '
P04745,AMY1A;AMY1B;AMY1C,
Q9GZN8,ADISSP,
P0C0L5,C4B;C4B_2,
Q96N11,INTS15,
P48960,ADGRE5,
Q9NUQ9,CYRIB,
P04062,GBA1,
P69905,HBA1;HBA2,
P62805,H4C1;H4C2;H4C3;H4C4;H4C5;H4C6;H4C8;H4C9;H4C11;H4C12;H4C13;H4C14;H4C15;H4C16,
O14745,NHERF1,
P23381,WARS1,
P54577,YARS1'
library(pQTLdata)
nodup <- function(x) sapply(x, function(s) unique(unlist(strsplit(s,";")))[1])
ucsc <- hg19Tables %>%
group_by(acc) %>%
summarize(
prot=paste(uniprotName,collapse=";"),
chrom=paste(X.chrom,collapse=";"),
start=min(chromStart),
end=max(chromEnd),
gene=paste(geneName,collapse=";")
)
# uniprot IDs are the same if proteins are the same
p <- select(caprion,Accession,Protein,Gene) %>%
left_join(ucsc,by=c("Protein"="prot")) %>%
select(Accession,Protein,Gene,gene,acc,chrom,start,end)
# however even with same uniprotID their protein names may be different
u <- select(caprion_ensembl,-c(chrom,start,end)) %>%
left_join(ucsc,by=c("Accession"="acc")) %>%
mutate(chrom=nodup(chrom)) %>%
filter(!is.na(Protein))
# The following check shows merge by uniprot is more sensible
filter(p,Accession!=acc)
filter(p,Gene!=gene)
filter(p,is.na(start))
filter(u,Protein!=prot)
umiss <- with(u,is.na(start))
pull(u[umiss,],Accession)
# "P04745" "P02655" "P55056" "P0C0L5" "P62805" "P69905"
# confirmed form UniProt.org that Gene is more up-to-date
# (obsolute), (APOC2), (APOC4), (C4B; C4B_2), (H4C1; H4C2; H4C3; H4C4; H4C5; H4C6; H4C8; H4C9; H4C11; H4C12; H4C13; H4C14; H4C15; H4C16), (HBA1; HBA2)
# They are amended according to glist-hg19 in the function below.
u[umiss,"Protein"] <- paste0(c("AMY1","APOC2","APOC4","CO4B","H4","HBA"),"_HUMAN")
u[umiss,"Gene"] <- c("AMY1","APOC2","APOC4","CO4B","H4C","HBA")
u[umiss,"chrom"] <- c("chr1","chr19","chr19","chr6","chr6","chr16")
u[umiss,"start"] <- c(104198140,45449238,45445494,31949833,26021906,222845)
u[umiss,"end"] <- c(104301311,45452822,45452822,32003195,27841289,227520)
caprion_modified <- rename(u,chrom.u=chrom,start.u=start,end.u=end)
a <- u[umiss,] %>%
transmute(acc=Accession,prot=Protein,gene=Gene,chrom,start,end)
ucsc2 <- ucsc %>%
mutate(prot=nodup(prot),chrom=nodup(chrom),gene=nodup(gene)) %>%
bind_rows(a)
caprion_ucsc_modified <- left_join(caprion_modified,ucsc2) %>%
mutate(chrom=if_else(is.na(chrom),paste(unique(strsplit(chrom.u, ";")[[1]]), collapse = ";"),chrom),
start=if_else(is.na(start),start.u,start),
end=if_else(is.na(end),end.u,end)) %>%
select(-c(acc,prot,gene,chrom.u,start.u,end.u))
quadruple <- function(d,label)
with(d,data.frame(Gene=label,chrom=paste0("chr",min(chr)),start=min(start),end=max(end)))
glist_refgene <- function()
{
# glist-hg19 lookup
INF <- Sys.getenv("INF")
glist_hg19 <- read.table(file.path(INF,"csd3","glist-hg19"),col.names=c("chr","start","end","gene")) %>%
filter(! chr %in% c("XY","Y"))
X <- with(glist_hg19,chr=="X")
glist_hg19 <- mutate(glist_hg19,chrom=paste0("chr",chr))
glist_hg19[X,"chr"] <- "23"
subset(glist_hg19,grepl("^AMY1",gene))
subset(glist_hg19,grepl("^C4B",gene)&chr=="6")
subset(glist_hg19,grepl("^HIST1H4|^HIST2H4[A-B]|HIST4H4",gene))
subset(glist_hg19,grepl("^HBA",gene))
AMY1 <- quadruple(subset(glist_hg19,grepl("^AMY1",gene)),label="AMY")
C4B <- quadruple(subset(glist_hg19,grepl("^C4B",gene)&chr=="6"),label="C4B")
HIST <- quadruple(subset(glist_hg19,grepl("^HIST1H4[A-L]",gene)),label="HIST")
HBA <- quadruple(subset(glist_hg19,grepl("^HBA",gene)),label="HBA")
glist_hg19 <- glist_hg19 %>%
group_by(gene) %>%
summarize(chrom=paste(chrom,collapse=";"),start=min(start),end=max(end)) %>%
mutate(chrom=nodup(chrom))
# refgenes
load("~/cambridge-ceu/turboman/turboman_hg19_reference_data.rda")
refgenes <- refgene_gene_coordinates_h19 %>%
mutate(chromosome=paste0("chr",chromosome)) %>%
setNames(c("chrom","start","end","gene","mid")) %>%
bed_intersect(glist_hg19)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.