BiocStyle::markdown()
options(width=100, max.print=1000)
knitr::opts_chunk$set(
    eval=as.logical(Sys.getenv("KNITR_EVAL", "TRUE")),
    cache=as.logical(Sys.getenv("KNITR_CACHE", "TRUE")))
suppressPackageStartupMessages({
    library(longevityTools) 
}) 

Getting Started

Installation

The longevityTools package can be installed from the R console using the following biocLite install command.

source("http://bioconductor.org/biocLite.R") # Sources the biocLite.R installation script 
biocLite("tgirke/longevityTools", build_vignettes=FALSE) # Installs package from GitHub

Loading package and documentation

library("longevityTools") # Loads the package
library(help="longevityTools") # Lists package info
vignette(topic="longevityTools_eDRUG", package="longevityTools") # Opens vignette

Import custom functions.

Preliminary functions that are still under developement, or not fully tested and documented can be imported with the source command from the inst/extdata directory of the package.

fctpath <- system.file("extdata", "longevityTools_SNPformat_Fct.R", package="longevityTools")
source(fctpath)

Introduction

This vignette is part of the NIA funded Longevity Genomics project. For more information on this project please visit its website here. The GitHub repository of the corresponding R package is available here.

In this vignette, data formatting is described to get from SNP genotype data to imputed dosage data based on the 1000 Genomes Phase 1 version 3 reference panel.

The SNP genotype data for MrOS and SOF are stored in NetCDF files, hereafter termed ncdf. Ncdf files are designed to store large array-like data and facilitates quick access to segments of the data along the dimensions of the array. The SNP genotype data is stored in a 2 dimensional array (SNP dimension and sample dimension). Plain text files provide annotation along the 2 dimensions. A SNP annotation file describes the variables along the SNP dimension, and the sample annotation file describes the variables along the sample dimension.

Generate SNP annotation file matching Will Rayner's strand files

Will Rayner's strand file download

Will Rayner at the Wellcome Trust created files indicating strand and allele information for SNPs for many commercial SNP arrays. Will's files are recommended for use by the UM imputation server. The first task is to align strand of the SNP annotation file to Will Rayner's strand file.

Read in Rayner strand file for the Illumina HumanOmni1-Quad_v1-0_H. H refers to the Illumina SNP manifest version. Manifest H is the most recent from Illumina. Version 1 refers to the beadpool they used. Only one beadpool was used for all HumanOmni1-Quad arrays, hence, all arrays of this type are beadpool version 1.0.

#Will Rayner H b37 strand file

#1,100,170 probes
RaynerStrand<-read.table("M:/work/longevity/U24/GWAScohorts/mrOS/arrayAnnotation/WillRayner/HumanOmni1-Quad_v1-0_H-b37-strand/humanomni1-quad_v1-0_h-b37.strand",header=F,stringsAsFactors=F,comment.char="")
names(RaynerStrand)<-c("SNPID","chr","pos","match","strand","alleles")
names(RaynerStrand)<-paste0(names(RaynerStrand),"_Rayner")

SNP annotation file for MrOS array data.

Survey probes with missing rsIDs, most would be excluded anyway.

snpAnnot<-read.table("E:/MROS_SOF_201109/SNPs_related/SNP_ANNOT_09132011.txt",header=T,stringsAsFactors=F,comment.char="",sep="\t")
sum(is.na(snpAnnot$b37_C_rs.id)) #1672 missing

sum(snpAnnot$CNVProbe[is.na(snpAnnot$b37_C_rs.id)]==TRUE & !is.na(snpAnnot$CNVProbe[is.na(snpAnnot$b37_C_rs.id)]))#19
sum(snpAnnot$CNV_ZEROED[is.na(snpAnnot$b37_C_rs.id)]==TRUE & !is.na(snpAnnot$CNV_ZEROED[is.na(snpAnnot$b37_C_rs.id)])) #1084
sum(snpAnnot$INDEL_PROBE[is.na(snpAnnot$b37_C_rs.id)]==TRUE & !is.na(snpAnnot$INDEL_PROBE[is.na(snpAnnot$b37_C_rs.id)]) ) #0
#1103 of the SNPs with missing b37Crsid's are CNV probes, zeroed, or indels. No big loss.
snpAnnot<-snpAnnot[!is.na(snpAnnot$b37_C_rs.id),] #1,138,747 remaining

Merge SNP annotation to Will Rayner's

snpAnnot<-merge(snpAnnot,RaynerStrand,by.x="b37_C_rs.id",by.y="SNPID_Rayner")
#1,100,170 probes, so all Rayner probes matched
sapply(snpAnnot,function(x) sum(is.na(x))) #45,028 missings for alle.A.b37_H
sum(is.na(snpAnnot$alle.A.b37_H[!is.na(snpAnnot$alle.B.b37_H)])) #A and B missing for same SNPs
#are the 45028 SNPs missing for alle.A.b37_H going to be removed anyway? Yes! They are all CNV probes!
sum(snpAnnot$CNVProbe[is.na(snpAnnot$alle.A.b37_H)]==TRUE & !is.na(snpAnnot$CNVProbe[is.na(snpAnnot$alle.A.b37_H)]))#45,028
sum(snpAnnot$CNV_ZEROED[is.na(snpAnnot$alle.A.b37_H)]==TRUE & !is.na(snpAnnot$CNV_ZEROED[is.na(snpAnnot$alle.A.b37_H)])) #45,028
sum(snpAnnot$INDEL_PROBE[is.na(snpAnnot$alle.A.b37_H)]==TRUE & !is.na(snpAnnot$INDEL_PROBE[is.na(snpAnnot$alle.A.b37_H)]) ) #0
table(snpAnnot$CNVProbe[is.na(snpAnnot$alle.A.b37_H)]) #All 45028 missings are CNV probes
#strangely, Rayner alleles all AG for SNPs missing b37H alleles
table(snpAnnot$alleles_Rayner[is.na(snpAnnot$alle.A.b37_H)])
#also strangely, there are alleles for CNVprobes.
table(snpAnnot$alleles_Rayner[snpAnnot$CNVProbe==TRUE ])
#non-CNV and non-indel alleles are all bi-allelic
table(snpAnnot$alleles_Rayner[snpAnnot$CNV_ZEROED==FALSE & snpAnnot$INDEL_PROBE==FALSE])
table(snpAnnot$alle.A.b37_H[snpAnnot$CNV_ZEROED==FALSE & snpAnnot$INDEL_PROBE==FALSE])
table(snpAnnot$alle.B.b37_H[snpAnnot$CNV_ZEROED==FALSE & snpAnnot$INDEL_PROBE==FALSE])

with(snpAnnot,sum(alle.A.b37_C != alle.A.b37_H & !is.na(alle.A.b37_C) & !is.na(alle.A.b37_H)))
#Allele A differs between C and H manifests for 5,067 SNPs

SNP QC filtering

Remove CNV and Indels.

UM imputation server requires biallelic variants.

#remove CNV and INDELs
sum(snpAnnot$CNV_ZEROED==TRUE)
sum(is.na(snpAnnot$CNV_ZEROED))
snpAnnot<-snpAnnot[snpAnnot$CNV_ZEROED==FALSE,] #908,323 probes
sum(snpAnnot$INDEL_PROBE==TRUE)
sum(is.na(snpAnnot$INDEL_PROBE))
snpAnnot<-snpAnnot[snpAnnot$INDEL_PROBE==FALSE,] #907,956 probes

#After removing CNV and InDels, no allele missings
sum(is.na(snpAnnot$alle.A.b37_H))
sum(is.na(snpAnnot$alle.B.b37_H))
sum(is.na(snpAnnot$alleles_Rayner)) 

posCmp function

The posCmp function from LongevityTools compares positions from our MrOS SNP annotation with Will Rayner's. Disagreements in chr or position in 7,433 of the 907,956 probes (.8%). Downstream steps rely on Imputation checking tool based on Will Rayner's annotation, so will use his chr and pos.

myCompare<-posCmp(snpAnnot$position_b37_H,snpAnnot$pos_Rayner,snpAnnot$Chr_b37_H,snpAnnot$chr_Rayner)
snpAnnot<-cbind(snpAnnot,myCompare)
with(snpAnnot,table(chrDisagree,posDisagree))#4677 chr and pos disagree, 2756 posDisagree but not chr, 0 chr disagree but not pos
length(snpAnnot$Chr_b37_H[ (snpAnnot$chrDisagree==1 & snpAnnot$MACH_KEEP_b37_H==TRUE) | (snpAnnot$posDisagree==1 & snpAnnot$MACH_KEEP_b37_H==TRUE)  ]) #1942 that I would have kept
table(snpAnnot$Chr_b37_H[ (snpAnnot$chrDisagree==1 & snpAnnot$MACH_KEEP_b37_H==TRUE) | (snpAnnot$posDisagree==1 & snpAnnot$MACH_KEEP_b37_H==TRUE)  ])
#Probes with disagreement in chr or position are evenly distributed across chromosomes
head(snpAnnot[snpAnnot$chrDisagree==1 & snpAnnot$MACH_KEEP_b37_H==TRUE,c("b37_C_rs.id","Chr_b37_H","chr_Rayner","position_b37_H")])
length(grep("^rs",snpAnnot$b37_C_rs.id[snpAnnot$MACH_KEEP_b37_H==TRUE],invert=TRUE)) #23,132 non-rs SNPs

Remove SNPs not on autosomes or X based on Rayner info

table(snpAnnot$Chr_b37_H)
table(snpAnnot$chr_Rayner)
sum(is.na(snpAnnot$chr_Rayner)) #0 missing
snpAnnot<-snpAnnot[snpAnnot$chr_Rayner!="MT",]
snpAnnot<-snpAnnot[snpAnnot$chr_Rayner!="Y",]

Split Rayner alleles.

Rayner alleles are in a single variable. Split them out into two variables for easier manipulation.

#since I removed CNV and indels, all probes are biallelic SNPs
sum(nchar(snpAnnot$alleles_Rayner)!=2)#0
sum(nchar(snpAnnot$alle.A.b37_C)!=1) #0
sum(nchar(snpAnnot$alle.B.b37_C)!=1) #0
table(snpAnnot$alleles_Rayner) #lowest alphabet letter listed first
table(snpAnnot$alle.A.b37_H)
table(snpAnnot$alle.B.b37_H)
snpAnnot$Rayner_A<-substr(snpAnnot$alleles_Rayner,1,1)
snpAnnot$Rayner_B<-substr(snpAnnot$alleles_Rayner,2,2)

strandCheck function

Using strandCheck function from LongevityTools, mark SNPs from the MrOS SNP annotation file that have alleles on the minus strand (flipped). 2074 SNPs on minus strand.

myFlip<-strandCheck(snpAnnot$alle.A.b37_H,snpAnnot$alle.B.b37_H,snpAnnot$Rayner_A,snpAnnot$Rayner_B,snpAnnot$strand_Rayner )
snpAnnot<-cbind(snpAnnot,myFlip)
sum(snpAnnot$flipped) #2074

strandFlip function

Using strandFlip function from LongevityTools, flip strand alleles that are represented on the minus strand. After flipping minus strand SNPs, check the SNPs again to make sure none are flagged as being on the minus strand. None are flagged.

myFlip<-strandFlip(snpAnnot$alle.A.b37_H,snpAnnot$alle.B.b37_H,snpAnnot$flipped)
snpAnnot<-cbind(snpAnnot,myFlip)
snpAnnot$A1<-as.character(snpAnnot$A1)
snpAnnot$B1<-as.character(snpAnnot$B1)

myFlip<-strandCheck(snpAnnot$A1,snpAnnot$B1,snpAnnot$Rayner_A,snpAnnot$Rayner_B,snpAnnot$strand_Rayner )
sum(myFlip[,1]) #0 SNPs! Perfect.
sum(myFlip[,2]) #0 SNPs! Perfect.
with(snpAnnot,table(alleles_Rayner[flipped==1]))
with(snpAnnot,table(alleles_Rayner))
table(with(snpAnnot,paste0(A1,B1)))

Allele possibilities

Noticed there are more allele combos in A1+B1 vs alleles_Rayner. Why? GC and TA are just rearranged. However, TC and TG are not in alleles_Rayner. TC and TG in A1+B1 are - strand on strand_Rayner. See below.

table(snpAnnot$strand_Rayner[snpAnnot$A1=="T" & snpAnnot$B1=="C"])
table(snpAnnot$strand_Rayner[snpAnnot$A1=="T" & snpAnnot$B1=="G"])
#As expected, they match if you flip strands
table(snpAnnot$alleles_Rayner[snpAnnot$A1=="T" & snpAnnot$B1=="C"]) #AG
table(snpAnnot$alleles_Rayner[snpAnnot$A1=="T" & snpAnnot$B1=="G"]) #AC

SNP QC filtering

Remove SNPs with > 1 discordant call between duplicate samples, missing rate > 2.5%, autosomal HWE P-value < 10-6, MAF < 0.01. Output new SNP annotation file.

#discordant calls
sum(is.na(snpAnnot$Discordant_Calls_1))
table(snpAnnot$Discordant_Calls_1)
snpAnnot<-snpAnnot[snpAnnot$Discordant_Calls_1==FALSE,]

#missing rate
sum(is.na(snpAnnot$missing.n1))
sum(snpAnnot$missing.n1>0.025)
snpAnnot<-snpAnnot[snpAnnot$missing.n1<=0.025, ]

#HWE
sum(is.na(snpAnnot$HWE_pvalue))
table(snpAnnot$chr_Rayner[is.na(snpAnnot$HWE_pvalue)]) #missing HWE on 1, 11, and X
sum(is.na(snpAnnot$HWE_auto_REMOVE))
sum(is.na(snpAnnot$HWE_X_REMOVE))
#I won't use auto and X because I don't want to apply different HWE cut-offs for autosomes and X.
sum(snpAnnot$HWE_pvalue<0.000001 & !is.na(snpAnnot$HWE_pvalue) )#2056
sum(snpAnnot$HWE_pvalue<0.0001 & !is.na(snpAnnot$HWE_pvalue) )#3175
sum(snpAnnot$HWE_pvalue<0.000001 & !is.na(snpAnnot$HWE_pvalue) & snpAnnot$chr_Rayner=="X")
sum(snpAnnot$HWE_pvalue<0.000001 & !is.na(snpAnnot$HWE_pvalue) & snpAnnot$chr_Rayner!="X")
snpAnnot<-snpAnnot[!is.na(snpAnnot$HWE_pvalue),] #removed 9
snpAnnot<-snpAnnot[snpAnnot$HWE_pvalue>=0.000001 | snpAnnot$chr_Rayner=="X",]

#MAF
sum(is.na(snpAnnot$maf))
sum(snpAnnot$maf<0.01 )
snpAnnot<-snpAnnot[snpAnnot$maf>=0.01,]

write.table(snpAnnot,file="//HOMER/MROS_SOF/plink2016/SNPs_related/SNP_ANNOT_DSE.txt",row.names=F,quote=F)

Chromosome-specific SNP annotation files

Split SNP annotation file by chromosome to make ncdf extraction easier.

snpAnnot2<-snpAnnot[,c("chr_Rayner","b37_C_rs.id","pos_Rayner","A1","B1","int.id")]
myChr<-c(1:22,"X")

for (i in myChr){
  snpAnnot3<-snpAnnot2[snpAnnot2$chr_Rayner==i,]
  snpAnnot3<-snpAnnot3[order(snpAnnot3$pos_Rayner),]
  write.table(snpAnnot3,file=paste0("//HOMER/MROS_SOF/plink2016/SNPs_related/SNP_ANNOT_chr",i,".txt"),row.names=F,quote=F)
}

Extract genotypes for each chromosome

NetCDF data access using ncdf4 package

library('ncdf4')

chrNames<-c(1:22,"X")

t1 <- proc.time()
for(j in 1:length(chrNames)) {

  study<-"MrOS"  #MrOS or SOF
  nc <- nc_open('/media/homer/MROS_SOF/MROS_SOF_201109/NetCDF_files_Sept2011/MrOS_SOF_SW.OMNI.GENO.09062011.nc', readunlim=F)
  snpAnnot<-read.table(paste0("/media/homer/MROS_SOF/plink2016/SNPs_related/SNP_ANNOT_chr",chrNames[j],".txt"),header=T,stringsAsFactors=F)
  samp<-read.table("/media/homer/MROS_SOF/MROS_SOF_201109/samples_related/sampleAnnotDan2013-7-18.txt",header=T,stringsAsFactors=F,sep="\t")
  row1<-sum(samp$collection==study & samp$worseDup==0 & samp$noConsent==0)
  col1<-length(snpAnnot$int.id)

  myDat<-matrix(nrow=row1,ncol=col1)

  for(i in 1:length(snpAnnot$int.id)){
    myStartSNP<-snpAnnot$int.id[i]

    mySNP <- ncvar_get(nc, "genotype", start=c(myStartSNP,1), count=c(1,-1))
    mySNP <- as.vector(mySNP)
    mySNP <- mySNP[samp$collection==study & samp$worseDup==0 & samp$noConsent==0]
    mySNP[mySNP==0]<-paste(snpAnnot$B1[i],snpAnnot$B1[i],sep=" ")
    if (chrNames[j]=="X"){
      #blank out Male X-chromosome hets
      mySNP[mySNP==1]<-"0 0"
      } else  mySNP[mySNP==1]<-paste(snpAnnot$A1[i],snpAnnot$B1[i],sep=" ")
    mySNP[mySNP==2]<-paste(snpAnnot$A1[i],snpAnnot$A1[i],sep=" ")
    mySNP[mySNP==-1]<-"0 0"

    myDat[,i]<-mySNP
  }
  nc_close(nc)

  myDat2<-matrix(nrow=row1,ncol=6)
  #myDat2[,1]<-samp$HA_ID[samp$collection==study & samp$worseDup==0 & samp$noConsent==0]
  #myDat2[,2]<-samp$HA_ID[samp$collection==study & samp$worseDup==0 & samp$noConsent==0]
  myDat2[,1]<-samp$individual.id[samp$collection==study & samp$worseDup==0 & samp$noConsent==0]
  myDat2[,2]<-samp$individual.id[samp$collection==study & samp$worseDup==0 & samp$noConsent==0]
  myDat2[,3]<-rep(0,row1)
  myDat2[,4]<-rep(0,row1)
  if(study=="MrOS") {
    myDat2[,5]<-rep(1,row1)
    } else if (study=="SOF") {
      myDat2[,5]<-rep(2,row1)
    } else print("study wrong")
  myDat2[,6]<- -9
  myDat3<-cbind(myDat2,myDat)
  write.table(myDat3,file=paste("/media/homer/MROS_SOF/plink2016/",study,"/MrOSID/",study,"chr",chrNames[j],".ped",sep=""),row.names=F,col.names=F,quote=F,sep="\t")
  print(j)
  map<-data.frame(chr=snpAnnot$chr_Rayner,rs=snpAnnot$b37_C_rs.id,genetic=rep(0,col1),position=snpAnnot$pos_Rayner,stringsAsFactors=F)
  write.table(map,file=paste("/media/homer/MROS_SOF/plink2016/",study,"/MrOSID/",study,"chr",chrNames[j],".map",sep=""),row.names=F,col.names=F,quote=F,sep="\t")

}

print(proc.time() - t1)

Pre-imputation checking of PLINK files

Rayner pre-imputation tool

Then used Will Rayner's Imputation preparation and checking tool

Also checks AF concordance with 1000G EUR phase 3 data. Results below.

Options Set:
Reference Panel:             1000G
Bim filename:                /home/devans/longevity/U24/GWAScohorts/MrOS/bin/MrOSallWhites.bim
Reference filename:          /home/devans/longevity/U24/GWAScohorts/rayner/1000GP_Phase3_combined.legend
Allele frequencies filename: /home/devans/longevity/U24/GWAScohorts/MrOS/frq/MrOSallWhites.frq
Allele frequency threshold:  0.2
Population for 1000G:        EUR


Matching to 1000G

Position Matches
 ID matches 1000G 0
 ID Doesn't match 1000G 734860
 Total Position Matches 734860
ID Match
 Different position to 1000G 1906
No Match to 1000G 2741
Skipped (X, XY, Y, MT) 0
Total in bim file 739528
Total processed 739507

Indels (ignored in r1) 0

SNPs not changed 211132
SNPs to change ref alt 518174
Strand ok 729292
Total Strand ok 729306

Strand to change 15
Total checked 736766
Total checked Strand 729307
Total removed for allele Frequency diff > 0.2 198
Palindromic SNPs with Freq > 0.4 2704


Non Matching alleles 4755
ID and allele mismatching 4745; where 1000G is . 0
Duplicates removed 21

Will's tool generates plink scripts to flip strands and switch reference/other allele.

Convert plink to sorted vcf

Convert PLINK files to sorted bgzipped vcf for the UM imputation server.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
#!/bin/bash
#PBS -t 1-22
#PBS -l mem=15gb
#PBS -l walltime=8:00:00
#PBS -q batch

dataPath=~/bigdata/U24/GWAScohorts/MrOS/geno_vcf

cd $dataPath

module load plink
module load vcftools

plink --ped ${dataPath}/MrOS_chr${PBS_ARRAYID}.ped --map ${dataPath}/MrOS_chr${PBS_ARRAYID}.map --recode vcf --out ${dataPath}/MrOS_chr${PBS_ARRAYID}

vcf-sort ${dataPath}/MrOS_chr${PBS_ARRAYID}.vcf | bgzip -c > ${dataPath}/MrOS_chr${PBS_ARRAYID}.vcf.gz

Post-imputation formatting

Convert from vcf to dosage

VCF files contain best guess genotypes, genotype probabilites, and allele dosages. I want dosages.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
#!/bin/bash

#PBS -t 1-22
#PBS -l mem=10gb
#PBS -l walltime=8:00:00
#PBS -q batch

dataPath=~/bigdata/U24/GWAScohorts/MrOS/vcf1KG

cd $dataPath

~/DosageConvertor/bin/DosageConvertor --vcfDose ${dataPath}/chr${PBS_ARRAYID}.dose.vcf.gz --info ${dataPath}/chr${PBS_ARRAYID}.info.gz --prefix chr${PBS_ARRAYID} --type plink --format DS

Duplicate InDels

Tried loading in PLINK, but errors, reporting duplicate IDs. Turns out InDels have same SNPIDs as SNPs at same position because SNPID is chr:pos. To solve this, I pasted alleles to the SNP identifiers, which distinguishes InDels from SNPs. However, some InDel alleles were extremely long (>200 kb), so I subsetted the alleles to the shortest possible nucleotide length that uniquely identified each variant.

#using data.table to modify plink files
library(data.table,lib.loc="/rhome/devans/Rlibs")

chrIndex<-1:22
pbs_arrayid <- as.integer(Sys.getenv("PBS_ARRAYID"))
myChr<-chrIndex[pbs_arrayid]

map<-fread(paste0("~/bigdata/U24/GWAScohorts/MrOS/vcf1KG/chr",myChr,".plink.map"))
setnames(map, c("chr","SNP","gen","pos") )

doseHeader<-scan("~/bigdata/U24/GWAScohorts/MrOS/vcf1KG/headerDosage.txt",sep="\t",what="character")
dat<-fread(paste0("gunzip -c ~/bigdata/U24/GWAScohorts/MrOS/vcf1KG/chr",myChr,".plink.dose.gz"),header=F,skip=1,col.names=doseHeader)

dat[(A1!="A" & A1!="C" & A1!="T" & A1!="G") | (A2!="A" & A2!="C" & A2!="T" & A2!="G" ) , SNP := paste0(SNP,"[b37]",A1,",",A2) ]
#pasting alleles got rid of duplicate SNPIDs
#create order variable so I can always retain original order
dat[,SNPorder := 1:dat[,.N] ]
map[,SNPorder := 1:map[,.N] ]  

#check for duplicate SNP IDs after truncating to first 100
#if dups found, check each 100 bp up to 2000
#are all SNPs with nchar >100 indels? Yes. 
#assigns result to x. 
x<-dat[nchar(SNP)>100] [SNP %like% "b37", .N ]  
if ( dat[nchar(SNP)>100, .N] != x){
  print (paste0("Detected SNPs with >100 chars in chr ",myChr)) 
} else {
  print(paste0("No SNPs >100 chars in chr ",myChr))
}
dat[, SNPtest := substr(SNP,1,100)   ]
#SNPtest variable contains first 100 characters of SNP. I want to use this as the SNPID. 
#By subsetting to first 100, could create duplicate IDs. Check if dups exist, and if so, include more BPs.
#Only if there were dups in the 100 character subset, create new SNPtest with minimum number of BPs until there are no dups. 
if(dat[duplicated(SNPtest), .N] > 0){
  result<-sapply(seq(200,2000,by=100), function(x) {
    dat[, SNPtest := substr(SNP,1,x)   ]
    dat[duplicated(SNPtest) , .N  ]    
  })

  if(sum(result==0)>0){
    myStop<-min(seq(200,2000,by=100)[ which(result==0)])
    dat[ ,SNPtest:=substr(SNP,1,myStop) ]
  } else print(paste0("Duplicate SNP IDs after 2000 bp in chr ",myChr)) 
} 

#set keys to order by SNP order
setkey(dat,SNPorder)
setkey(map,SNPorder)
#cbind, single column of dat is a vector
map[ , SNPnew := dat[,SNPtest] ]
#this also works, subset is a data.table, but is easily added
#map[ , SNPnew := dat2[,.(SNPtest)] ]

map <- map[ ,.(chr,SNPnew,gen,pos) ]
myNumCol <- ncol(dat)
dat <- dat[ ,c(myNumCol, 2:(myNumCol-2) ) , with=FALSE]

fwrite(map, file.path=paste0("~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr",myChr,".plink.map"),sep="\t",col.names=FALSE, verbose=TRUE )
fwrite(dat, file.path=paste0("~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr",myChr,".plink.dose"),sep="\t",col.names=FALSE, verbose=TRUE )

Job submission script.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
#!/bin/bash

#PBS -l mem=200gb
#PBS -l walltime=48:00:00
#PBS -q highmem
#PBS -j oe

cd $PBS_O_WORKDIR

Rscript --vanilla ./uniqueSNPchr.R

Very long (>2 kb) duplicate InDels

Even after that, there were InDel dups for 3 chromosomes: 6, 14, and 19. First InDel of the dups was retained.

library(data.table,lib.loc="/rhome/devans/Rlibs")

myChr<-6L
#myChr<-14L
#myChr<-19L

map<-fread(paste0("~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr",myChr,".plink.map"))
setnames(map, c("chr","SNP","gen","pos") )
map[,SNPorder := 1:map[,.N] ]  

#cbind info to map, remove duplicate from map. Then, use SNPorder to subset dose file.
info<-fread(paste0("~/bigdata/U24/GWAScohorts/MrOS/vcf1KG/chr",myChr,".info") )
names(info)[1] <- "SNPinfo"
map2<-cbind(map,info)
#identical(map2[,SNPorder], 1:902461)
map2<-map2[!duplicated(SNP)]
myRows<-map2[,c(SNPorder)] #rows with non-duplicated SNPIDs

#myDup<-map2[duplicated(SNP),c(SNP) ]
#map2[SNP %in% myDup ,which=TRUE] which argument returns the matching row numbers
#map2[SNP %in% myDup ,]
#myDup2<- base::strsplit(myDup,"[",fixed=TRUE )[[1]][1]

doseHeader<-scan("~/bigdata/U24/GWAScohorts/MrOS/vcf1KG/headerDosage.txt",sep="\t",what="character")
dat<-fread(paste0("~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr",myChr,".plink.dose"),header=F,col.names=doseHeader)

dat<-dat[myRows]

fwrite(map2, file.path=paste0("~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr",myChr,".plink.Info.map"),sep="\t",col.names=FALSE, verbose=TRUE  )

map2 <- map2[ ,.(chr,SNP,gen,pos) ]

fwrite(map2, file.path=paste0("~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr",myChr,".plink.map"),sep="\t",col.names=FALSE, verbose=TRUE )
fwrite(dat, file.path=paste0("~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr",myChr,".plink.dose"),sep="\t",col.names=FALSE, verbose=TRUE )

Phenotype based on survival percentiles

SSA data

Birth cohort and gender-specific survival percentiles from the US Social Security Administration are provided in Table 7 here. The age at the Xth survival percentile is the youngest age at which lx <= (1-X)*100,000. For the 90th survival percentile, lx <= 0.1*100,000 = 10,000.

Linear predictor for each year

SSA data is given by decade. A linear predictor was used to impute age at each year.

dat<-read.csv("~/Documents/longevity/1KGlongevity/ages.csv",header=T,stringsAsFactors=F)
dat$birthYear<-dat$birthYear - 1900

myPredictor<-function(x){
  lm1<-lm(x~birthYear,data=dat)
  newdata<-with(dat,expand.grid(
    birthYear = seq(min(birthYear),max(birthYear),by=1 ),
    age = 0
  ))
  newdata$age <- round(predict(lm1,newdata))
  return(newdata$age)
}

result<-sapply(dat[,2:9],myPredictor)
myBirthYear = seq(min(dat$birthYear),max(dat$birthYear),by=1 )+1900
result<-cbind(myBirthYear,result)
write.csv(result,file="~/Documents/longevity/1KGlongevity/agesPredict.csv",quote=F,row.names=F)

Apply percentiles to follow-up data

efDat<-read.csv("M:/work/mrOS/dataPheno/endpoints/efaug15.csv",header=T,stringsAsFactors=F)
V1dat<-read.csv("M:/work/mrOS/dataPheno/V1FEB14.csv",header=T,stringsAsFactors=F)
V1dat<-V1dat[,c("ID","SITE","GIAGE1")]
ageLUT<-read.csv("M:/work/longevity/chargeAging/1KGlongevity/AP/lifetables/agesPredict.csv",header=T,stringsAsFactors=F)
fam<-read.table("M:/work/longevity/chargeAging/1KGlongevity/MrOS/data/chr21.plink.fam",header=F,stringsAsFactors = F)
names(fam)<-c("FID","IID","father","mother","sex","pheno")

dat<-merge(efDat,V1dat,by="ID")
dat$EFDATE<-as.Date(dat$EFDATE, format="%m/%d/%Y")
dat$cohort<-as.numeric(format(dat$EFDATE,"%Y")) - dat$GIAGE1

dat$finalAge<-dat$GIAGE1 + dat$FUCYTIME

#provides age cut-off based on birthyear cohort
dat$male90cut<-ageLUT[match(dat$cohort,ageLUT$myBirthYear),"male90"]
dat$male99cut<-ageLUT[match(dat$cohort,ageLUT$myBirthYear),"male99"]
dat$male60cut<-ageLUT[match(dat$cohort,ageLUT$myBirthYear),"male60"]
#no missings, meaning there were matches for all birth cohorts.

dat$cases90<-0
dat$cases90[dat$finalAge>=dat$male90cut]<-1

dat$cases99<-0
dat$cases99[dat$finalAge>=dat$male99cut]<-1

dat$control60dead<-0
dat$control60dead[dat$EFSTATUS==1 & dat$finalAge<=dat$male60cut] <-1

dat$control60alive<-0
dat$control60alive[dat$EFSTATUS!=1 & dat$finalAge<=dat$male60cut] <-1

dat$control60DA<-0
dat$control60DA[dat$finalAge<=dat$male60cut] <-1

#output data for all MrOS subjects, even those without GWAS
write.csv(dat,file="M:/work/longevity/chargeAging/1KGlongevity/MrOS/data/MrOSsurvivalPercentiles.csv",row.names=F,quote=F)

Association analysis

Run analysis using PLINK

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
#!/bin/bash

#PBS -t 1-22
#PBS -l mem=14gb
#PBS -l walltime=10:00:00
#PBS -j oe

module load plink

cd ~/bigdata/U24/CHARGElongevity/dead90/scripts
plink --memory 10000 --pheno ../data/dead90.txt --covar ../data/dead90cov.txt --allow-no-sex --fam ~/bigdata/U24/GWAScohorts/MrOS/vcf1KG/chr${PBS_ARRAYID}.plink.fam --map ~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr${PBS_ARRAYID}.plink.map --dosage ~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr${PBS_ARRAYID}.plink.dose.gz Zout noheader format=1 --out ../results/dead90chr${PBS_ARRAYID}

module unload plink


tgirke/longevityTools documentation built on May 31, 2019, 9:07 a.m.