knitr::opts_chunk$set(echo = TRUE)

Load the data.

library(airway)
data(airway)
airway

Normalize in CPM.

libSize <- colSums(assay(airway, "counts"))
assay(airway, "CPM") <- t(t(assay(airway, "counts")) / libSize) * 1E6

Identify genes on the Y chromosome.

Ygenes <- unique(names(which(table(seqnames(airway) == "Y")[, "TRUE"] > 0)))

Aggregate counts across all genes on the Y chromosome

yCPM <- colSums(assay(airway, "CPM")[Ygenes, ])
# Cluster on the pairwise distance between total Y-chromosome CPM.
hc <- hclust(dist(yCPM))
plot(hc)

K-mean clustering with k=2 (expected number of genders)

km <- kmeans(matrix(yCPM, ncol = 1), 2)

Test if the total Y-CPM of the two groups is significantly different

testDataFrame <- data.frame(yCPM=yCPM, kmean=km$cluster)
ttOut <- t.test(yCPM ~ kmean, testDataFrame)
if (ttOut$p.value < 1E-3) {
    # there are both genders
    # return the group with the highest km$centers as male
    # return the group with the highest km$centers as female
} else {
    # there is only one gender
    # we can only tell which gender if we're given an idea of what either "low" or "high" means
}


kevinrue/hancock documentation built on May 17, 2020, 7:55 a.m.