BiocStyle::markdown() options(width=100, max.print=1000) knitr::opts_chunk$set( eval=as.logical(Sys.getenv("KNITR_EVAL", "TRUE")), cache=as.logical(Sys.getenv("KNITR_CACHE", "TRUE")))
suppressPackageStartupMessages({ library(longevityTools) })
The longevityTools
package can be installed from the R console using the following biocLite
install command.
source("http://bioconductor.org/biocLite.R") # Sources the biocLite.R installation script biocLite("tgirke/longevityTools", build_vignettes=FALSE) # Installs package from GitHub
library("longevityTools") # Loads the package library(help="longevityTools") # Lists package info vignette(topic="longevityTools_eDRUG", package="longevityTools") # Opens vignette
Preliminary functions that are still under developement, or not fully tested and documented
can be imported with the source
command from the inst/extdata
directory of the package.
fctpath <- system.file("extdata", "longevityTools_SNPformat_Fct.R", package="longevityTools") source(fctpath)
This vignette is part of the NIA funded Longevity Genomics project. For more information on this project please visit its website here. The GitHub repository of the corresponding R package is available here.
In this vignette, data formatting is described to get from SNP genotype data to imputed dosage data based on the 1000 Genomes Phase 1 version 3 reference panel.
The SNP genotype data for MrOS and SOF are stored in NetCDF files, hereafter termed ncdf. Ncdf files are designed to store large array-like data and facilitates quick access to segments of the data along the dimensions of the array. The SNP genotype data is stored in a 2 dimensional array (SNP dimension and sample dimension). Plain text files provide annotation along the 2 dimensions. A SNP annotation file describes the variables along the SNP dimension, and the sample annotation file describes the variables along the sample dimension.
Will Rayner at the Wellcome Trust created files indicating strand and allele information for SNPs for many commercial SNP arrays. Will's files are recommended for use by the UM imputation server. The first task is to align strand of the SNP annotation file to Will Rayner's strand file.
Read in Rayner strand file for the Illumina HumanOmni1-Quad_v1-0_H. H refers to the Illumina SNP manifest version. Manifest H is the most recent from Illumina. Version 1 refers to the beadpool they used. Only one beadpool was used for all HumanOmni1-Quad arrays, hence, all arrays of this type are beadpool version 1.0.
#Will Rayner H b37 strand file #1,100,170 probes RaynerStrand<-read.table("M:/work/longevity/U24/GWAScohorts/mrOS/arrayAnnotation/WillRayner/HumanOmni1-Quad_v1-0_H-b37-strand/humanomni1-quad_v1-0_h-b37.strand",header=F,stringsAsFactors=F,comment.char="") names(RaynerStrand)<-c("SNPID","chr","pos","match","strand","alleles") names(RaynerStrand)<-paste0(names(RaynerStrand),"_Rayner")
Survey probes with missing rsIDs, most would be excluded anyway.
snpAnnot<-read.table("E:/MROS_SOF_201109/SNPs_related/SNP_ANNOT_09132011.txt",header=T,stringsAsFactors=F,comment.char="",sep="\t") sum(is.na(snpAnnot$b37_C_rs.id)) #1672 missing sum(snpAnnot$CNVProbe[is.na(snpAnnot$b37_C_rs.id)]==TRUE & !is.na(snpAnnot$CNVProbe[is.na(snpAnnot$b37_C_rs.id)]))#19 sum(snpAnnot$CNV_ZEROED[is.na(snpAnnot$b37_C_rs.id)]==TRUE & !is.na(snpAnnot$CNV_ZEROED[is.na(snpAnnot$b37_C_rs.id)])) #1084 sum(snpAnnot$INDEL_PROBE[is.na(snpAnnot$b37_C_rs.id)]==TRUE & !is.na(snpAnnot$INDEL_PROBE[is.na(snpAnnot$b37_C_rs.id)]) ) #0 #1103 of the SNPs with missing b37Crsid's are CNV probes, zeroed, or indels. No big loss. snpAnnot<-snpAnnot[!is.na(snpAnnot$b37_C_rs.id),] #1,138,747 remaining
snpAnnot<-merge(snpAnnot,RaynerStrand,by.x="b37_C_rs.id",by.y="SNPID_Rayner") #1,100,170 probes, so all Rayner probes matched sapply(snpAnnot,function(x) sum(is.na(x))) #45,028 missings for alle.A.b37_H sum(is.na(snpAnnot$alle.A.b37_H[!is.na(snpAnnot$alle.B.b37_H)])) #A and B missing for same SNPs #are the 45028 SNPs missing for alle.A.b37_H going to be removed anyway? Yes! They are all CNV probes! sum(snpAnnot$CNVProbe[is.na(snpAnnot$alle.A.b37_H)]==TRUE & !is.na(snpAnnot$CNVProbe[is.na(snpAnnot$alle.A.b37_H)]))#45,028 sum(snpAnnot$CNV_ZEROED[is.na(snpAnnot$alle.A.b37_H)]==TRUE & !is.na(snpAnnot$CNV_ZEROED[is.na(snpAnnot$alle.A.b37_H)])) #45,028 sum(snpAnnot$INDEL_PROBE[is.na(snpAnnot$alle.A.b37_H)]==TRUE & !is.na(snpAnnot$INDEL_PROBE[is.na(snpAnnot$alle.A.b37_H)]) ) #0 table(snpAnnot$CNVProbe[is.na(snpAnnot$alle.A.b37_H)]) #All 45028 missings are CNV probes #strangely, Rayner alleles all AG for SNPs missing b37H alleles table(snpAnnot$alleles_Rayner[is.na(snpAnnot$alle.A.b37_H)]) #also strangely, there are alleles for CNVprobes. table(snpAnnot$alleles_Rayner[snpAnnot$CNVProbe==TRUE ]) #non-CNV and non-indel alleles are all bi-allelic table(snpAnnot$alleles_Rayner[snpAnnot$CNV_ZEROED==FALSE & snpAnnot$INDEL_PROBE==FALSE]) table(snpAnnot$alle.A.b37_H[snpAnnot$CNV_ZEROED==FALSE & snpAnnot$INDEL_PROBE==FALSE]) table(snpAnnot$alle.B.b37_H[snpAnnot$CNV_ZEROED==FALSE & snpAnnot$INDEL_PROBE==FALSE]) with(snpAnnot,sum(alle.A.b37_C != alle.A.b37_H & !is.na(alle.A.b37_C) & !is.na(alle.A.b37_H))) #Allele A differs between C and H manifests for 5,067 SNPs
UM imputation server requires biallelic variants.
#remove CNV and INDELs sum(snpAnnot$CNV_ZEROED==TRUE) sum(is.na(snpAnnot$CNV_ZEROED)) snpAnnot<-snpAnnot[snpAnnot$CNV_ZEROED==FALSE,] #908,323 probes sum(snpAnnot$INDEL_PROBE==TRUE) sum(is.na(snpAnnot$INDEL_PROBE)) snpAnnot<-snpAnnot[snpAnnot$INDEL_PROBE==FALSE,] #907,956 probes #After removing CNV and InDels, no allele missings sum(is.na(snpAnnot$alle.A.b37_H)) sum(is.na(snpAnnot$alle.B.b37_H)) sum(is.na(snpAnnot$alleles_Rayner))
The posCmp function from LongevityTools compares positions from our MrOS SNP annotation with Will Rayner's. Disagreements in chr or position in 7,433 of the 907,956 probes (.8%). Downstream steps rely on Imputation checking tool based on Will Rayner's annotation, so will use his chr and pos.
myCompare<-posCmp(snpAnnot$position_b37_H,snpAnnot$pos_Rayner,snpAnnot$Chr_b37_H,snpAnnot$chr_Rayner) snpAnnot<-cbind(snpAnnot,myCompare) with(snpAnnot,table(chrDisagree,posDisagree))#4677 chr and pos disagree, 2756 posDisagree but not chr, 0 chr disagree but not pos length(snpAnnot$Chr_b37_H[ (snpAnnot$chrDisagree==1 & snpAnnot$MACH_KEEP_b37_H==TRUE) | (snpAnnot$posDisagree==1 & snpAnnot$MACH_KEEP_b37_H==TRUE) ]) #1942 that I would have kept table(snpAnnot$Chr_b37_H[ (snpAnnot$chrDisagree==1 & snpAnnot$MACH_KEEP_b37_H==TRUE) | (snpAnnot$posDisagree==1 & snpAnnot$MACH_KEEP_b37_H==TRUE) ]) #Probes with disagreement in chr or position are evenly distributed across chromosomes head(snpAnnot[snpAnnot$chrDisagree==1 & snpAnnot$MACH_KEEP_b37_H==TRUE,c("b37_C_rs.id","Chr_b37_H","chr_Rayner","position_b37_H")]) length(grep("^rs",snpAnnot$b37_C_rs.id[snpAnnot$MACH_KEEP_b37_H==TRUE],invert=TRUE)) #23,132 non-rs SNPs
table(snpAnnot$Chr_b37_H) table(snpAnnot$chr_Rayner) sum(is.na(snpAnnot$chr_Rayner)) #0 missing snpAnnot<-snpAnnot[snpAnnot$chr_Rayner!="MT",] snpAnnot<-snpAnnot[snpAnnot$chr_Rayner!="Y",]
Rayner alleles are in a single variable. Split them out into two variables for easier manipulation.
#since I removed CNV and indels, all probes are biallelic SNPs sum(nchar(snpAnnot$alleles_Rayner)!=2)#0 sum(nchar(snpAnnot$alle.A.b37_C)!=1) #0 sum(nchar(snpAnnot$alle.B.b37_C)!=1) #0 table(snpAnnot$alleles_Rayner) #lowest alphabet letter listed first table(snpAnnot$alle.A.b37_H) table(snpAnnot$alle.B.b37_H) snpAnnot$Rayner_A<-substr(snpAnnot$alleles_Rayner,1,1) snpAnnot$Rayner_B<-substr(snpAnnot$alleles_Rayner,2,2)
Using strandCheck function from LongevityTools, mark SNPs from the MrOS SNP annotation file that have alleles on the minus strand (flipped). 2074 SNPs on minus strand.
myFlip<-strandCheck(snpAnnot$alle.A.b37_H,snpAnnot$alle.B.b37_H,snpAnnot$Rayner_A,snpAnnot$Rayner_B,snpAnnot$strand_Rayner ) snpAnnot<-cbind(snpAnnot,myFlip) sum(snpAnnot$flipped) #2074
Using strandFlip function from LongevityTools, flip strand alleles that are represented on the minus strand. After flipping minus strand SNPs, check the SNPs again to make sure none are flagged as being on the minus strand. None are flagged.
myFlip<-strandFlip(snpAnnot$alle.A.b37_H,snpAnnot$alle.B.b37_H,snpAnnot$flipped) snpAnnot<-cbind(snpAnnot,myFlip) snpAnnot$A1<-as.character(snpAnnot$A1) snpAnnot$B1<-as.character(snpAnnot$B1) myFlip<-strandCheck(snpAnnot$A1,snpAnnot$B1,snpAnnot$Rayner_A,snpAnnot$Rayner_B,snpAnnot$strand_Rayner ) sum(myFlip[,1]) #0 SNPs! Perfect. sum(myFlip[,2]) #0 SNPs! Perfect. with(snpAnnot,table(alleles_Rayner[flipped==1])) with(snpAnnot,table(alleles_Rayner)) table(with(snpAnnot,paste0(A1,B1)))
Noticed there are more allele combos in A1+B1 vs alleles_Rayner. Why? GC and TA are just rearranged. However, TC and TG are not in alleles_Rayner. TC and TG in A1+B1 are - strand on strand_Rayner. See below.
table(snpAnnot$strand_Rayner[snpAnnot$A1=="T" & snpAnnot$B1=="C"]) table(snpAnnot$strand_Rayner[snpAnnot$A1=="T" & snpAnnot$B1=="G"]) #As expected, they match if you flip strands table(snpAnnot$alleles_Rayner[snpAnnot$A1=="T" & snpAnnot$B1=="C"]) #AG table(snpAnnot$alleles_Rayner[snpAnnot$A1=="T" & snpAnnot$B1=="G"]) #AC
Remove SNPs with > 1 discordant call between duplicate samples, missing rate > 2.5%, autosomal HWE P-value < 10-6, MAF < 0.01. Output new SNP annotation file.
#discordant calls sum(is.na(snpAnnot$Discordant_Calls_1)) table(snpAnnot$Discordant_Calls_1) snpAnnot<-snpAnnot[snpAnnot$Discordant_Calls_1==FALSE,] #missing rate sum(is.na(snpAnnot$missing.n1)) sum(snpAnnot$missing.n1>0.025) snpAnnot<-snpAnnot[snpAnnot$missing.n1<=0.025, ] #HWE sum(is.na(snpAnnot$HWE_pvalue)) table(snpAnnot$chr_Rayner[is.na(snpAnnot$HWE_pvalue)]) #missing HWE on 1, 11, and X sum(is.na(snpAnnot$HWE_auto_REMOVE)) sum(is.na(snpAnnot$HWE_X_REMOVE)) #I won't use auto and X because I don't want to apply different HWE cut-offs for autosomes and X. sum(snpAnnot$HWE_pvalue<0.000001 & !is.na(snpAnnot$HWE_pvalue) )#2056 sum(snpAnnot$HWE_pvalue<0.0001 & !is.na(snpAnnot$HWE_pvalue) )#3175 sum(snpAnnot$HWE_pvalue<0.000001 & !is.na(snpAnnot$HWE_pvalue) & snpAnnot$chr_Rayner=="X") sum(snpAnnot$HWE_pvalue<0.000001 & !is.na(snpAnnot$HWE_pvalue) & snpAnnot$chr_Rayner!="X") snpAnnot<-snpAnnot[!is.na(snpAnnot$HWE_pvalue),] #removed 9 snpAnnot<-snpAnnot[snpAnnot$HWE_pvalue>=0.000001 | snpAnnot$chr_Rayner=="X",] #MAF sum(is.na(snpAnnot$maf)) sum(snpAnnot$maf<0.01 ) snpAnnot<-snpAnnot[snpAnnot$maf>=0.01,] write.table(snpAnnot,file="//HOMER/MROS_SOF/plink2016/SNPs_related/SNP_ANNOT_DSE.txt",row.names=F,quote=F)
Split SNP annotation file by chromosome to make ncdf extraction easier.
snpAnnot2<-snpAnnot[,c("chr_Rayner","b37_C_rs.id","pos_Rayner","A1","B1","int.id")] myChr<-c(1:22,"X") for (i in myChr){ snpAnnot3<-snpAnnot2[snpAnnot2$chr_Rayner==i,] snpAnnot3<-snpAnnot3[order(snpAnnot3$pos_Rayner),] write.table(snpAnnot3,file=paste0("//HOMER/MROS_SOF/plink2016/SNPs_related/SNP_ANNOT_chr",i,".txt"),row.names=F,quote=F) }
library('ncdf4') chrNames<-c(1:22,"X") t1 <- proc.time() for(j in 1:length(chrNames)) { study<-"MrOS" #MrOS or SOF nc <- nc_open('/media/homer/MROS_SOF/MROS_SOF_201109/NetCDF_files_Sept2011/MrOS_SOF_SW.OMNI.GENO.09062011.nc', readunlim=F) snpAnnot<-read.table(paste0("/media/homer/MROS_SOF/plink2016/SNPs_related/SNP_ANNOT_chr",chrNames[j],".txt"),header=T,stringsAsFactors=F) samp<-read.table("/media/homer/MROS_SOF/MROS_SOF_201109/samples_related/sampleAnnotDan2013-7-18.txt",header=T,stringsAsFactors=F,sep="\t") row1<-sum(samp$collection==study & samp$worseDup==0 & samp$noConsent==0) col1<-length(snpAnnot$int.id) myDat<-matrix(nrow=row1,ncol=col1) for(i in 1:length(snpAnnot$int.id)){ myStartSNP<-snpAnnot$int.id[i] mySNP <- ncvar_get(nc, "genotype", start=c(myStartSNP,1), count=c(1,-1)) mySNP <- as.vector(mySNP) mySNP <- mySNP[samp$collection==study & samp$worseDup==0 & samp$noConsent==0] mySNP[mySNP==0]<-paste(snpAnnot$B1[i],snpAnnot$B1[i],sep=" ") if (chrNames[j]=="X"){ #blank out Male X-chromosome hets mySNP[mySNP==1]<-"0 0" } else mySNP[mySNP==1]<-paste(snpAnnot$A1[i],snpAnnot$B1[i],sep=" ") mySNP[mySNP==2]<-paste(snpAnnot$A1[i],snpAnnot$A1[i],sep=" ") mySNP[mySNP==-1]<-"0 0" myDat[,i]<-mySNP } nc_close(nc) myDat2<-matrix(nrow=row1,ncol=6) #myDat2[,1]<-samp$HA_ID[samp$collection==study & samp$worseDup==0 & samp$noConsent==0] #myDat2[,2]<-samp$HA_ID[samp$collection==study & samp$worseDup==0 & samp$noConsent==0] myDat2[,1]<-samp$individual.id[samp$collection==study & samp$worseDup==0 & samp$noConsent==0] myDat2[,2]<-samp$individual.id[samp$collection==study & samp$worseDup==0 & samp$noConsent==0] myDat2[,3]<-rep(0,row1) myDat2[,4]<-rep(0,row1) if(study=="MrOS") { myDat2[,5]<-rep(1,row1) } else if (study=="SOF") { myDat2[,5]<-rep(2,row1) } else print("study wrong") myDat2[,6]<- -9 myDat3<-cbind(myDat2,myDat) write.table(myDat3,file=paste("/media/homer/MROS_SOF/plink2016/",study,"/MrOSID/",study,"chr",chrNames[j],".ped",sep=""),row.names=F,col.names=F,quote=F,sep="\t") print(j) map<-data.frame(chr=snpAnnot$chr_Rayner,rs=snpAnnot$b37_C_rs.id,genetic=rep(0,col1),position=snpAnnot$pos_Rayner,stringsAsFactors=F) write.table(map,file=paste("/media/homer/MROS_SOF/plink2016/",study,"/MrOSID/",study,"chr",chrNames[j],".map",sep=""),row.names=F,col.names=F,quote=F,sep="\t") } print(proc.time() - t1)
Then used Will Rayner's Imputation preparation and checking tool
Also checks AF concordance with 1000G EUR phase 3 data. Results below.
Options Set: Reference Panel: 1000G Bim filename: /home/devans/longevity/U24/GWAScohorts/MrOS/bin/MrOSallWhites.bim Reference filename: /home/devans/longevity/U24/GWAScohorts/rayner/1000GP_Phase3_combined.legend Allele frequencies filename: /home/devans/longevity/U24/GWAScohorts/MrOS/frq/MrOSallWhites.frq Allele frequency threshold: 0.2 Population for 1000G: EUR Matching to 1000G Position Matches ID matches 1000G 0 ID Doesn't match 1000G 734860 Total Position Matches 734860 ID Match Different position to 1000G 1906 No Match to 1000G 2741 Skipped (X, XY, Y, MT) 0 Total in bim file 739528 Total processed 739507 Indels (ignored in r1) 0 SNPs not changed 211132 SNPs to change ref alt 518174 Strand ok 729292 Total Strand ok 729306 Strand to change 15 Total checked 736766 Total checked Strand 729307 Total removed for allele Frequency diff > 0.2 198 Palindromic SNPs with Freq > 0.4 2704 Non Matching alleles 4755 ID and allele mismatching 4745; where 1000G is . 0 Duplicates removed 21
Will's tool generates plink scripts to flip strands and switch reference/other allele.
Convert PLINK files to sorted bgzipped vcf for the UM imputation server.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 | #!/bin/bash #PBS -t 1-22 #PBS -l mem=15gb #PBS -l walltime=8:00:00 #PBS -q batch dataPath=~/bigdata/U24/GWAScohorts/MrOS/geno_vcf cd $dataPath module load plink module load vcftools plink --ped ${dataPath}/MrOS_chr${PBS_ARRAYID}.ped --map ${dataPath}/MrOS_chr${PBS_ARRAYID}.map --recode vcf --out ${dataPath}/MrOS_chr${PBS_ARRAYID} vcf-sort ${dataPath}/MrOS_chr${PBS_ARRAYID}.vcf | bgzip -c > ${dataPath}/MrOS_chr${PBS_ARRAYID}.vcf.gz |
VCF files contain best guess genotypes, genotype probabilites, and allele dosages. I want dosages.
1 2 3 4 5 6 7 8 9 10 11 12 | #!/bin/bash #PBS -t 1-22 #PBS -l mem=10gb #PBS -l walltime=8:00:00 #PBS -q batch dataPath=~/bigdata/U24/GWAScohorts/MrOS/vcf1KG cd $dataPath ~/DosageConvertor/bin/DosageConvertor --vcfDose ${dataPath}/chr${PBS_ARRAYID}.dose.vcf.gz --info ${dataPath}/chr${PBS_ARRAYID}.info.gz --prefix chr${PBS_ARRAYID} --type plink --format DS |
Tried loading in PLINK, but errors, reporting duplicate IDs. Turns out InDels have same SNPIDs as SNPs at same position because SNPID is chr:pos. To solve this, I pasted alleles to the SNP identifiers, which distinguishes InDels from SNPs. However, some InDel alleles were extremely long (>200 kb), so I subsetted the alleles to the shortest possible nucleotide length that uniquely identified each variant.
#using data.table to modify plink files library(data.table,lib.loc="/rhome/devans/Rlibs") chrIndex<-1:22 pbs_arrayid <- as.integer(Sys.getenv("PBS_ARRAYID")) myChr<-chrIndex[pbs_arrayid] map<-fread(paste0("~/bigdata/U24/GWAScohorts/MrOS/vcf1KG/chr",myChr,".plink.map")) setnames(map, c("chr","SNP","gen","pos") ) doseHeader<-scan("~/bigdata/U24/GWAScohorts/MrOS/vcf1KG/headerDosage.txt",sep="\t",what="character") dat<-fread(paste0("gunzip -c ~/bigdata/U24/GWAScohorts/MrOS/vcf1KG/chr",myChr,".plink.dose.gz"),header=F,skip=1,col.names=doseHeader) dat[(A1!="A" & A1!="C" & A1!="T" & A1!="G") | (A2!="A" & A2!="C" & A2!="T" & A2!="G" ) , SNP := paste0(SNP,"[b37]",A1,",",A2) ] #pasting alleles got rid of duplicate SNPIDs #create order variable so I can always retain original order dat[,SNPorder := 1:dat[,.N] ] map[,SNPorder := 1:map[,.N] ] #check for duplicate SNP IDs after truncating to first 100 #if dups found, check each 100 bp up to 2000 #are all SNPs with nchar >100 indels? Yes. #assigns result to x. x<-dat[nchar(SNP)>100] [SNP %like% "b37", .N ] if ( dat[nchar(SNP)>100, .N] != x){ print (paste0("Detected SNPs with >100 chars in chr ",myChr)) } else { print(paste0("No SNPs >100 chars in chr ",myChr)) } dat[, SNPtest := substr(SNP,1,100) ] #SNPtest variable contains first 100 characters of SNP. I want to use this as the SNPID. #By subsetting to first 100, could create duplicate IDs. Check if dups exist, and if so, include more BPs. #Only if there were dups in the 100 character subset, create new SNPtest with minimum number of BPs until there are no dups. if(dat[duplicated(SNPtest), .N] > 0){ result<-sapply(seq(200,2000,by=100), function(x) { dat[, SNPtest := substr(SNP,1,x) ] dat[duplicated(SNPtest) , .N ] }) if(sum(result==0)>0){ myStop<-min(seq(200,2000,by=100)[ which(result==0)]) dat[ ,SNPtest:=substr(SNP,1,myStop) ] } else print(paste0("Duplicate SNP IDs after 2000 bp in chr ",myChr)) } #set keys to order by SNP order setkey(dat,SNPorder) setkey(map,SNPorder) #cbind, single column of dat is a vector map[ , SNPnew := dat[,SNPtest] ] #this also works, subset is a data.table, but is easily added #map[ , SNPnew := dat2[,.(SNPtest)] ] map <- map[ ,.(chr,SNPnew,gen,pos) ] myNumCol <- ncol(dat) dat <- dat[ ,c(myNumCol, 2:(myNumCol-2) ) , with=FALSE] fwrite(map, file.path=paste0("~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr",myChr,".plink.map"),sep="\t",col.names=FALSE, verbose=TRUE ) fwrite(dat, file.path=paste0("~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr",myChr,".plink.dose"),sep="\t",col.names=FALSE, verbose=TRUE )
Job submission script.
1 2 3 4 5 6 7 8 9 10 | #!/bin/bash #PBS -l mem=200gb #PBS -l walltime=48:00:00 #PBS -q highmem #PBS -j oe cd $PBS_O_WORKDIR Rscript --vanilla ./uniqueSNPchr.R |
Even after that, there were InDel dups for 3 chromosomes: 6, 14, and 19. First InDel of the dups was retained.
library(data.table,lib.loc="/rhome/devans/Rlibs") myChr<-6L #myChr<-14L #myChr<-19L map<-fread(paste0("~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr",myChr,".plink.map")) setnames(map, c("chr","SNP","gen","pos") ) map[,SNPorder := 1:map[,.N] ] #cbind info to map, remove duplicate from map. Then, use SNPorder to subset dose file. info<-fread(paste0("~/bigdata/U24/GWAScohorts/MrOS/vcf1KG/chr",myChr,".info") ) names(info)[1] <- "SNPinfo" map2<-cbind(map,info) #identical(map2[,SNPorder], 1:902461) map2<-map2[!duplicated(SNP)] myRows<-map2[,c(SNPorder)] #rows with non-duplicated SNPIDs #myDup<-map2[duplicated(SNP),c(SNP) ] #map2[SNP %in% myDup ,which=TRUE] which argument returns the matching row numbers #map2[SNP %in% myDup ,] #myDup2<- base::strsplit(myDup,"[",fixed=TRUE )[[1]][1] doseHeader<-scan("~/bigdata/U24/GWAScohorts/MrOS/vcf1KG/headerDosage.txt",sep="\t",what="character") dat<-fread(paste0("~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr",myChr,".plink.dose"),header=F,col.names=doseHeader) dat<-dat[myRows] fwrite(map2, file.path=paste0("~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr",myChr,".plink.Info.map"),sep="\t",col.names=FALSE, verbose=TRUE ) map2 <- map2[ ,.(chr,SNP,gen,pos) ] fwrite(map2, file.path=paste0("~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr",myChr,".plink.map"),sep="\t",col.names=FALSE, verbose=TRUE ) fwrite(dat, file.path=paste0("~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr",myChr,".plink.dose"),sep="\t",col.names=FALSE, verbose=TRUE )
Birth cohort and gender-specific survival percentiles from the US Social Security Administration are provided in Table 7 here. The age at the Xth survival percentile is the youngest age at which lx <= (1-X)*100,000. For the 90th survival percentile, lx <= 0.1*100,000 = 10,000.
SSA data is given by decade. A linear predictor was used to impute age at each year.
dat<-read.csv("~/Documents/longevity/1KGlongevity/ages.csv",header=T,stringsAsFactors=F) dat$birthYear<-dat$birthYear - 1900 myPredictor<-function(x){ lm1<-lm(x~birthYear,data=dat) newdata<-with(dat,expand.grid( birthYear = seq(min(birthYear),max(birthYear),by=1 ), age = 0 )) newdata$age <- round(predict(lm1,newdata)) return(newdata$age) } result<-sapply(dat[,2:9],myPredictor) myBirthYear = seq(min(dat$birthYear),max(dat$birthYear),by=1 )+1900 result<-cbind(myBirthYear,result) write.csv(result,file="~/Documents/longevity/1KGlongevity/agesPredict.csv",quote=F,row.names=F)
efDat<-read.csv("M:/work/mrOS/dataPheno/endpoints/efaug15.csv",header=T,stringsAsFactors=F) V1dat<-read.csv("M:/work/mrOS/dataPheno/V1FEB14.csv",header=T,stringsAsFactors=F) V1dat<-V1dat[,c("ID","SITE","GIAGE1")] ageLUT<-read.csv("M:/work/longevity/chargeAging/1KGlongevity/AP/lifetables/agesPredict.csv",header=T,stringsAsFactors=F) fam<-read.table("M:/work/longevity/chargeAging/1KGlongevity/MrOS/data/chr21.plink.fam",header=F,stringsAsFactors = F) names(fam)<-c("FID","IID","father","mother","sex","pheno") dat<-merge(efDat,V1dat,by="ID") dat$EFDATE<-as.Date(dat$EFDATE, format="%m/%d/%Y") dat$cohort<-as.numeric(format(dat$EFDATE,"%Y")) - dat$GIAGE1 dat$finalAge<-dat$GIAGE1 + dat$FUCYTIME #provides age cut-off based on birthyear cohort dat$male90cut<-ageLUT[match(dat$cohort,ageLUT$myBirthYear),"male90"] dat$male99cut<-ageLUT[match(dat$cohort,ageLUT$myBirthYear),"male99"] dat$male60cut<-ageLUT[match(dat$cohort,ageLUT$myBirthYear),"male60"] #no missings, meaning there were matches for all birth cohorts. dat$cases90<-0 dat$cases90[dat$finalAge>=dat$male90cut]<-1 dat$cases99<-0 dat$cases99[dat$finalAge>=dat$male99cut]<-1 dat$control60dead<-0 dat$control60dead[dat$EFSTATUS==1 & dat$finalAge<=dat$male60cut] <-1 dat$control60alive<-0 dat$control60alive[dat$EFSTATUS!=1 & dat$finalAge<=dat$male60cut] <-1 dat$control60DA<-0 dat$control60DA[dat$finalAge<=dat$male60cut] <-1 #output data for all MrOS subjects, even those without GWAS write.csv(dat,file="M:/work/longevity/chargeAging/1KGlongevity/MrOS/data/MrOSsurvivalPercentiles.csv",row.names=F,quote=F)
Run analysis using PLINK
1 2 3 4 5 6 7 8 9 10 11 12 13 | #!/bin/bash #PBS -t 1-22 #PBS -l mem=14gb #PBS -l walltime=10:00:00 #PBS -j oe module load plink cd ~/bigdata/U24/CHARGElongevity/dead90/scripts plink --memory 10000 --pheno ../data/dead90.txt --covar ../data/dead90cov.txt --allow-no-sex --fam ~/bigdata/U24/GWAScohorts/MrOS/vcf1KG/chr${PBS_ARRAYID}.plink.fam --map ~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr${PBS_ARRAYID}.plink.map --dosage ~/bigdata/U24/GWAScohorts/MrOS/plink1KG/chr${PBS_ARRAYID}.plink.dose.gz Zout noheader format=1 --out ../results/dead90chr${PBS_ARRAYID} module unload plink |
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.