R/subsetSamples.r

Defines functions subsetSamples

# Use the outputs of the <map2gene> function as inputs to get a subset of genotype matrix including just the specified subset of samples
# (Optionally) filter variants that have redundant IDs, or have no valid genotype calls
subsetSamples<-function(gt, sample.id, filter.variants=TRUE, remove.monosnp=TRUE) {
  # gt    Outputs from the <map2gene> function, including 4 elements: 
            # 'genotype': genotype call matrix
            # 'annotation': GRanges object with variant annotation
            # 'allele': all alleles of each variant
            # 'map2gene': mapping information related to genes, with 6 elements generated by the <map2gene> function
                # 'summary': variant-to-region summary table, rows match variants one-to-one
                # 'location': details of variant-to-region mapping
                # 'mutation': details of variant-to-mutation type mapping
                # 'gene2variant': list of gene-to-variant mapping
                # 'variant2gene': list of variant-to-gene mapping
                # 'alternative.code': table of recoded calls, match the 'allele' table, see <recodeCalls> functio for details
  # sample.id         IDs of samples to be included in the outputs
  # filter.variants   If TRUE, filter variants to remove those with duplicated ID (keep ones with highest number of valid calls) and those having no valid calls
  # remove.monosnp    If TRUE, remove monomorphic SNPs
  
  g<-gt$genotype;
  
  sample.id<-unique(sample.id[sample.id %in% colnames(g)]);
  if (length(sample.id) > 0) {
    if (!filter.variants) gt[[1]]<-g[, sample.id] else { # only change the genotype matrix
      g<-g[, sample.id, drop=FALSE];
      
      # flag of filtering
      keep<-rep(TRUE, nrow(g));
      names(keep)<-rownames(g); 
      
      # valid calls per variant
      n<-apply(g, 1, function(g) length(g[!is.na(g) & g>-1]));
      keep[n==0]<-FALSE; # remove variants with no valid calls
      
      # remove duplicated IDs
      id<-rownames(g);
      id.dup<-unique(id[duplicated(id)]);
      if (length(id.dup) > 0) {
        # index of duplicated ID to be removed
        ind.out<-lapply(id.dup, function(i) {
          ind<-which(id == i);
          ct<-n[ind];
          ind[order(ct)][-length(ct)];
        });
        keep[unique(unlist(ind.out))]<-FALSE;
      }
      
      if (remove.monosnp) {
        ct<-apply(g, 1, function(g) {
          x<-unique(g);
          length(x[!is.na(x) & x>-1]);
        });
        keep[ct<=1]<-FALSE;
      }
      
      # variant IDs to be kept
      keep.id<-names(keep[keep]);
      
      gt[[1]]<-g[keep, , drop=FALSE];
      gt[[2]]<-gt[[2]][keep];
      gt[[3]]<-gt[[3]][keep, , drop=FALSE];
      
      if (!is.null(gt$map2gene)) {
        anno<-gt[[4]];
        anno[[1]]<-anno[[1]][keep, , drop=FALSE];
        anno[[2]]<-anno[[2]][as.vector(anno[[2]]$varID) %in% keep.id, , drop=FALSE];
        anno[[3]]<-anno[[3]][as.vector(anno[[3]]$varID) %in% keep.id, , drop=FALSE];
        anno[[5]]<-anno[[5]][names(anno[[5]]) %in% keep.id];
        anno[[6]]<-anno[[6]][keep, , drop=FALSE];
        
        gn<-rep(names(anno[[4]]), sapply(anno[[4]], length));
        v<-unlist(anno[[4]], use.names=FALSE);
        anno[[4]]<-split(v[v %in% keep.id], gn[v %in% keep.id]);
        
        gt[[4]]<-anno;
      }      
    }    
  }
  
  gt;
}
zhezhangsh/GtUtility documentation built on May 4, 2019, 11:20 p.m.