if (! requireNamespace("readr")) { install.packages("readr") }
setwd("/Users/Kevin/Desktop/Bioinformatics/BCB420/GEOSystemBio/data") list.files() filename_1 <- "GDS5810.txt" openfile_1<- file(filename_1, open = "r")
filename_2 <-"GDS5810_clean.txt" openfile_2<- file(filename_2,open = "w") line <-readLines(openfile_1) tx2 <-gsub(pattern="^(!.|#.)",replace="", line) #remove the comment starting #with "!","#","" writeLines(tx2,con=openfile_2) close(openfile_1) close(openfile_2)
tmp <- readr::read_tsv(file.path("../data", "GDS5810_clean.txt"),comment="^",col_names =TRUE,skip_empty_rows=TRUE)
head(tmp)
ID_REF IDENTIFIER GSM1588481 GSM1588485 GSM1588482 GSM1588486 GSM1588483 GSM1588487 GSM1588484
nrow(tmp) #33297 entries in this dataset;
GEO_Unique_IDENTIFIER <-unique(tmp$IDENTIFIER) #26804 IDENTIFIERS which are also GEO_Unique_IDENTIFIER #potential HGNC symbol have been #retrived
sum(duplicated(tmp$IDENTIFIER)) #Indeed there are 6493 of duplication;
duplicated_IDENTIFIER <- tmp$IDENTIFIER[duplicated(tmp$IDENTIFIER)] duplicated_IDENTIFIER tmp[tmp$IDENTIFIER %in% duplicated_IDENTIFIER, ]
# A tibble: 7,826 x 10
ID_REF IDENTIFIER GSM1588481 GSM1588485 GSM1588482 GSM1588486 GSM1588483
tmp[tmp$IDENTIFIER %in% c("OR4F17","OR4F29","M37726"), ]
ID_REF IDENTIFIER GSM1588481 GSM1588485 GSM1588482 GSM1588486 GSM1588483
tmp <- tmp[ ! (tmp$IDENTIFIER %in% duplicated_IDENTIFIER), ]
any(duplicated(tmp$IDENTIFIER)) now result is FALSE;
nrow(tmp) # total of 25471 unique IDENTIFERs has been cleaned 33297 - 25471 = # 7826 of duplicated entries has been removed;
sum(is.na(tmp$IDENTIFIER)) # there are nothing identifier shows as na; sum(tmp$IDENTIFIER =="") # there are nothing space value in IDENTIFIER;
myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
biomart_HGNC_Table <- biomaRt::getBM(filters = "hgnc_symbol",
attributes = "hgnc_symbol",
values =tmp$IDENTIFIER,
mart = myMart)
head(biomart_HGNC_Table)
1 AADACL3 2 AADACL4 3 ACADM 4 ACOT11 5 ACTL8 6 ACTRT2
nrow(biomart_HGNC_Table) #there are 17798 of Identifiers have been retrived nrow(tmp)
biomart_HGNC_Table_IDENTIFIER <- biomart_HGNC_Table$hgnc_symbol
unmapped_tmp <- tmp[ ! (tmp$IDENTIFIER %in% biomart_HGNC_Table_IDENTIFIER),]
head(unmapped_tmp)
ID_REF IDENTIFIER GSM1588481 GSM1588485 GSM1588482 GSM1588486 GSM1588483
nrow(unmapped_tmp) #total of 7673 unmapped symbol;
view(unmapped_tmp)
unmapped_tmp_chr_location <- unmapped_tmp$IDENTIFIER[grepl("chr",unmapped_tmp$IDENTIFIER)] length(unmapped_tmp_chr_location) #there are total of 6381 entries
unmapped_tmp_LOC <- unmapped_tmp$IDENTIFIER[grepl("LOC",unmapped_tmp$IDENTIFIER)] length(unmapped_tmp_LOC) #there are total of 357 entries
unmapped_tmp_previous_HGNC <-unmapped_tmp$IDENTIFIER[!(grepl("chr",unmapped_tmp$IDENTIFIER))] length(unmapped_tmp_previous_HGNC)
unmapped_tmp_previous_HGNC <-unmapped_tmp$IDENTIFIER[!(grepl("chr|LOC",unmapped_tmp$IDENTIFIER))] length(unmapped_tmp_previous_HGNC) #total of 935 symbols;
GEO_IDENTIFIER <- tmp$IDENTIFIER
names(GEO_IDENTIFIER) <- biomart_HGNC_Table head(GEO_IDENTIFIER)
GEO_MAP_TO_HGNC <- tmp$IDENTIFIER GEO_MAP_TO_HGNC names(GEO_MAP_TO_HGNC) <- biomart_HGNC_Table_IDENTIFIER
GEO_MAP_TO_HGNC
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.