dev/gwas/ParseGwasCatalog.r

# Download and process GWAS results from GWAS catalog
ParseGwasCatalog<-function(url="https://www.ebi.ac.uk/gwas/api/search/downloads/full", 
                           path=paste(Sys.getenv("RCHIVE_HOME"), 'data/gwas/public/gwascatalog', sep='/'), 
                           update.all.pubmed=FALSE) {
  # url    Source file URL
  # path   Path to output files
  # update.all.pubmed   Re-download all PubMed entries if TRUE
  
  options()
  
  library(RCurl);
  library(rchive);
  
  if (!file.exists(path)) dir.create(path, recursive=TRUE);
  if (!file.exists(paste(path, 'r', sep='/'))) dir.create(paste(path, 'r', sep='/'), recursive=TRUE);
  if (!file.exists(paste(path, 'src', sep='/'))) dir.create(paste(path, 'src', sep='/'), recursive=TRUE);
  
  # download the table 
  ln<-strsplit(getURL(url), '\n')[[1]];
  gw<-do.call('rbind', strsplit(ln, '\t'));
  colnames(gw)<-gw[1,];
  gw<-gw[-1, ];
  gw<-data.frame(gw, stringsAsFactors=FALSE);
  
  # this will download an outdated version of GWAScatalog, 
  # but its column names were used across the rest of this function
  # so it's downloaded just to provide the column names
  library(NCBI2R);
  gw0<-GetPublishedGWAS(); 
  if (ncol(gw) == ncol(gw0)) colnames(gw)<-colnames(gw0) else warning("Unmatched column names between versions!")
  
  # save a copy of full table
  fn.src<-sapply(strsplit(url, '://'), function(x) x[2]);
  fn.src<-gsub('/', '_', fn.src);
  saveRDS(gw, file=paste(fn.src, '.rds', sep=''));
  
  ##########################################################################################################
  # start working on the table
  
  # fields of numeric values
  numeric.fields<-c('Chr_id', 'Chr_pos', 'Merged', 'Intergenic', 'RiskAlleleFrequency', 'pValue', 'Pvalue_mlog', 'ORorbeta'); # fields shold be numeric
  num<-gw[, numeric.fields];
  num<-apply(num, 2, as.numeric);
  gw[, numeric.fields]<-num;
  
  # fields must have valid value
  required.fields<-c('Snp_id_current', 'Chr_id', 'Chr_pos', 'Pvalue_mlog');
  req<-gw[, required.fields];
  n<-apply(req, 2, function(x) is.na(x) | x=='');
  cnv<-gw[gw$CNV=='Y', , drop=FALSE]; # CNV variants
  inc<-gw[rowSums(n)>0, , drop=FALSE];
  gw<-gw[gw$CNV!='Y' & rowSums(n)==0, , drop=FALSE]; 
  saveRDS(cnv, file=paste(path, 'r', 'gwascatalog_just_cnv.rds', sep='/'));
  saveRDS(inc, file=paste(path, 'r', 'gwascatalog_incomplete_info.rds', sep='/'));
  saveRDS(gw, file=paste(path, 'r', 'gwascatalog_complete_info.rds', sep='/'));
  
  #####################
  gw<-data.frame(id=paste('rs', gw[['Snp_id_current']], sep=''), phred=as.integer(round(10*abs(gw[['Pvalue_mlog']]))), gw, stringsAsFactors=FALSE);
  #####################
  
  # rename chromosomes
  chr<-as.character(gw[['Chr_id']]);
  chr[chr=='23']<-'X';
  chr[chr=='24']<-'Y';
  chr[chr=='25']<-'MT';
  gw[['Chr_id']]<-chr;
  
  # the combination of SNP ID, PubMed ID, Trait, and p value text should be unique
  dup.fields<-c('ReportedGenes', 'Mapped_gene', 'Region', 'Snp_gene_ids'); # the fields that might cause duplicated entries
  gw<-gw[!duplicated(gw[, !(colnames(gw) %in% dup.fields)]), ];
  uniq<-paste(gw[['id']], gw[['PUBMEDID']], gw[['DiseaseTrait']], gw[['pValuetext']], sep='@_@');
  dup<-uniq[duplicated(uniq)];# duplicated entries
  dups<-lapply(dup, function(dup) gw[uniq==dup, ]);
  dups<-sapply(dups, function(d) rownames(d[order(d[, 'phred'])])[nrow(d)]); # pick one with the highest phred score
  gw<-rbind(gw[!(uniq %in% dup), ], gw[dups,]); # duplicated enries removed
  gw<-gw[!is.na(gw$phred), ]; # Entry will be removed if reported p value is 0
  
  ###############################################################################################################
  ###############################################################################################################
  cat('Re-formatting data ...\n');
  
  # re-format p value text
  txt<-gw[['pValuetext']];
  txt[is.na(txt)]<-'';
  txt<-gsub(' ', '', txt);
  txt<-gsub("[()]", '', txt);
  #anno[,4]<-txt;
  
  # unique analysis and study
  ana<-paste(gw[['PUBMEDID']], gw[['DiseaseTrait']], gw[['pValuetext']], sep='@_@'); # analyses
  sty<-split(ana, gw[['PUBMEDID']]); # studies
  sty<-lapply(sty, unique);
  sty<-lapply(sty, sort);
  
  # analysis ID
  ana.nm<-unlist(sty, use.names=FALSE); # analysis names
  ana.id<-paste('pmid', rep(names(sty), sapply(sty, length)), '_', unlist(sapply(sty, function(x) 1:length(x))), sep=''); # analysis ID
  names(ana.id)<-ana.nm;
  id<-as.vector(ana.id[ana]); # analysis ID of each row
  
  # SNP positions
  pos<-gw[, c('id', 'Chr_id', 'Chr_pos')];
  pos<-pos[!duplicated(pos[[1]]), ];
  pos<-pos[order(pos[,2], pos[,3]), ];
  
  # output table
  tbl<-matrix(nr=length(unique(gw$id)), nc=length(ana.id), dimnames=list(as.vector(pos[,1]), sort(ana.id)));
  
  # phred score of variants in each analysis
  phred<-gw$phred;
  names(phred)<-as.vector(gw$id);
  phred<-split(phred, id);
  for (i in 1:length(phred)) {
    v<-phred[[i]];
    p<-split(as.vector(v), names(v));
    p<-sapply(p, max); 
    tbl[names(p), names(phred)[i]]<-as.integer(p);
    #print(i); 
  }
  
  # Phred score matrix
  pos[[3]]<-as.integer(pos[[3]]);
  tbl<-data.frame(pos, tbl);
  rownames(tbl)<-as.vector(tbl[[1]]);
  #colnames(tbl)[1:3]<-c('id', 'chr', 'pos');
  tbl<-tbl[, 4:ncol(tbl)];
  saveRDS(tbl, file=paste(path, 'r/phred_gwascatalog.rds', sep='/'));
  
  ###############################################################################################################
  ###############################################################################################################
  cat('Getting study information from Pubmed ...\n');
  
  # annotate analyses
  #an<-gw[!duplicated(id), c(4, 7, 6, 8, 35, 9, 10, 11, 12)]; # select fields
  an<-gw[!duplicated(id), ]; # one analysis, one row
  rownames(an)<-id[!duplicated(id)];
  sty.id<-paste('pmid', an$PUBMEDID, sep=''); # Non-unique study ID
  st.id<-unique(sty.id); # unique study ID
  
  # Get article info from pubmed
  if (!update.all.pubmed & file.exists(paste(path, 'r/pubmed_downloaded.rds', sep='/'))) { # existing info
    print('Loading existing pubmed information ...');
    pm<-readRDS(paste(path, 'r/pubmed_downloaded.rds', sep='/'));
    id<-sub('^pmid', '', st.id);
    new.pub<-id[!(id %in% names(pm))];
    if (length(new.pub) > 0) { # there are new pubmed articles
      new.pm<-GetPubMedAbstract(new.pub);
      names(new.pm)<-new.pub;
      saveRDS(new.pm, file=paste(path, 'r/pubmed_new_article.rds', sep='/'));
      pm<-c(pm, new.pm);
      saveRDS(pm,file=paste(path, 'r/pubmed_downloaded.rds', sep='/'));
    }
  } else {
    pm<-GetPubMedAbstract(sub('^pmid', '', st.id));
    names(pm)<-sub('^pmid', '', st.id);
    saveRDS(pm,file=paste(path, 'r/pubmed_downloaded.rds', sep='/'));
  }    
  
  # PubMed table
  pubmed<-GetPubMedFields(pm);
  saveRDS(pubmed, file=paste(path, 'r/pubmed.rds', sep='/'));
  
  id2pub<-lapply(rownames(pubmed), function(id) c(ID=id, pubmed[id, ]));
  names(id2pub)<-rownames(pubmed);
  saveRDS(id2pub, file=paste(path, 'r/pubmed_by_id.rds', sep='/'));
  
  ###############################################################################################################
  ###############################################################################################################
  cat('Putting together analysis and study tables ...\n');
  
  # Number of phred scores of each analysis
  n<-sapply(rownames(an), function(id) {
    p<-tbl[, id];
    length(p[!is.na(p)]);
  });
  mx<-apply(tbl, 2, function(x) max(x, na.rm=TRUE));
  mn<-apply(tbl, 2, function(x) min(x, na.rm=TRUE));
  mx<-sapply(rownames(an), function(id) max(tbl[, id], na.rm=TRUE));
  mn<-sapply(rownames(an), function(id) min(tbl[, id], na.rm=TRUE));
  
  # analysis annotation table
  title<-pubmed[, 'Title'];
  names(title)<-paste('pmid',rownames(pubmed), sep='');
  desc<-paste(as.vector(an$Study), 'Disease trait:', as.vector(an$DiseaseTrait));
  ana<-data.frame(Source=rep('GWAScatalog', nrow(an)), Name=as.vector(title[sty.id]), Study_ID=sty.id,
                  URL=paste('http://www.ncbi.nlm.nih.gov/pubmed/', sub('^pmid', '', sty.id), sep=''),
                  Num_Variants=as.vector(n), Min_Phred=as.vector(mn), Max_Phred=as.vector(mx), Description=desc, row.names=rownames(an));
  ana<-ana[order(rownames(ana)), , drop=FALSE];
  saveRDS(ana, file=paste(path, 'r/analysis.rds', sep='/'))
  
  # study anotation table
  n.ana<-as.vector(table(ana$Study)[paste('pmid', rownames(pubmed), sep='')]);
  n.ana[is.na(n.ana)]<-0;
  std<-data.frame(Source=rep('GWAScatalog', nrow(pubmed)), Name=as.vector(pubmed$Title), Num_Analyses=n.ana, URL=as.vector(pubmed$URL), 
                  Description=as.vector(pubmed$Abstract), row.names=paste('pmid', rownames(pubmed), sep=''), stringsAsFactors=FALSE);
  saveRDS(std[order(rownames(std)),], file=paste(path, 'r/study.rds', sep='/')); 
  
  ## full analysis info by ID
  a<-cbind(ana[rownames(an), ], an, PubMed=sub('pmid', '', as.vector(ana[rownames(an), 'Study_ID'])));
  cnm<-c( # fields to be included
    Name = 'Name',
    'Study ID' = 'Study_ID',
    URL = 'URL',
    'PubMed' = 'PubMed',
    'Number of variants' = 'Num_Variants',
    'Minimum Phred score' = 'Min_Phred',
    'Maximum Phred score' = 'Max_Phred',
    Trait = 'DiseaseTrait',
    Journal = 'Journal',
    Date = 'Date',
    Author = 'FirstAuthor',
    Platform = 'Platform',
    'Initial sample' = 'InitialSampleDescription',
    'Replication sample' = 'ReplicationSampleDescription'
  )
  cnm<-cnm[cnm %in% colnames(a)];
  a<-a[, cnm];
  for (i in 1:ncol(a)) a[[i]]<-as.vector(a[[i]]);
  a[is.na(a)]<-'';
  colnames(a)<-names(cnm);
  # create list
  id2ana<-lapply(rownames(a), function(id) {
    flds<-a[id, ];
    sid<-as.vector(ana[id, 'Study_ID']);
    as.list(c(ID = id, Source='GWAScatalog', flds, 'Study Name' = as.vector(std[sid, 'Name']), 'Study full description' = as.vector(std[sid, 'Description'])));
    
  });
  names(id2ana)<-rownames(a);
  id2ana<-id2ana[order(names(id2ana))];
  saveRDS(id2ana, file=paste(path, 'r/analysis_by_id.rds', sep='/'))
  
  ## full study info by ID
  id2std<-lapply(rownames(std), function(id) {
    list(
      ID = id,
      Source = 'GWAScatalog',
      Name = std[id, 'Name'],
      URL = std[id, 'URL'],
      PubMed = sub('pmid', '', id),
      'Number of analyses' = std[id, 'Num_Analyses'],
      Analyses = rownames(ana)[ana$Study_ID==id & !is.na(ana$Study_ID)],
      Journal = pubmed[sub('^pmid', '', id), 'Journal'],
      Date = pubmed[sub('^pmid', '', id), 'Date'],
      'Full description' = std[id, 'Description']
    )
  });
  names(id2std)<-rownames(std);
  saveRDS(id2std[order(names(id2std))], file=paste(path, 'r/study_by_id.rds', sep='/'));
  
  ana2pm<-lapply(id2ana, function(x) x$PubMed);
  std2pm<-lapply(id2std, function(x) x$PubMed);
  saveRDS(ana2pm, file=paste(path, 'r/analysis2pubmed.rds', sep='/'));
  saveRDS(std2pm, file=paste(path, 'r/study2pubmed.rds', sep='/'));
  
  list(
    snp = rownames(tbl),
    analysis = rownames(ana),
    study = rownames(std),
    pubmed = rownames(pubmed)
  )
}
zhezhangsh/rchive documentation built on June 17, 2020, 3:55 a.m.