class_script.R
In faarseer/systems_biology: comparelists in vector

#################1st class_Rscript###############
'''
source("http://bioconductor.org/biocLite.R")
biocLite("made4")
browseVignettes("made4")
library("made4")
'''
#################################################

################2nd class_Rscript################
library(dplyr)
setwd("~/Desktop/systems biology/")
save.image("chapter1/myworkspace")
load("chapter1/myworkspace")
rm(list = ls())

mydata = read.table("chapter1/snps.txt", header = T, sep= "\t")
dim(mydata)
print(mydata[2,3])
names(mydata)
head(mydata)
tail(mydata)
mydata[1:5,1:2]
indices = c(1,7,10,2,4)
mydata$name[indices]


animals = read.table("chapter1/animals.txt", header = T, sep = "\t")
animals2 = read.table("chapter1/animals2.txt", header = T, sep = "\t", na.strings = "*")
names(animals2)
class(animals2$weight)
animals2$weight <- as.numeric(animals2$weight)
alldata = cbind(mydata, animals)
head(alldata)
alldata <- alldata %>% select(id, weight, allele1, allele2)
head(alldata)
class(alldata$weight)
alldata$weight <- as.numeric(alldata$weight)
class(alldata$weight)
alldata = alldata[order(alldata$weight, decreasing = T),]
summary(alldata)
str(alldata)
barplot(c(summary(alldata$allele1), summary(alldata$allele2)), main = "Allele counts", col = c(1,2,3,1,2,3))

#########################3rd class_script#####################
s_a1 <- summary(alldata$allele1)
s_a2 <- summary(alldata$allele2)
print(c(s_a1, s_a2))
s_a1 + s_a2
s_a1 - s_a2
s_a1 * s_a2
s_a1%*% s_a2

rbind(s_a1, s_a2)

barplot(s_a1, s_a2)

pooled = c(as.character(alldata$allele1), as.character(alldata$allele2))
pooled = summary(factor(pooled))

pooled = c(alldata$allele1, alldata$allele2)
as.factor(pooled)
pooled = factor(pooled, labels = c('-','A','B'))

barplot(pooled, main = 'Pooled allele counts', col = c(1,2,3))
legend("topleft", c('-','A','B'), fil = 1:3)

boxplot(alldata$weight, col = 'blue', main = "Boxplot if weights for all animals", ylim =c(200,500))

boxplot(alldata$weight~alldata$allele1, col = 2:4, main = "Boxplot of weights by allele class")
plot(density(alldata$weight, na.rm = T), main = "Density", col = "blue")
names(density(alldata$weight, na.rm = T))

plot(sort(alldata$weight), col = "blue", main = "Sorted weights",xlab = "animal", ylab = "weight")
lines(sort(alldata$weight), col = "red")
a <- alldata$weight[order(alldata$weight, decreasing = T)]
plot(a, col = "blue", main = "Sorted weights",xlab = "animal", ylab = "weight")
lines(a, col = "red")

write.table(alldata, "chapter1/alldata.txt", quote = F, row.names = F, sep = '\t')
###########################################################
#################2017_04_03#############################
sires = read.table("chapter2/siredata.txt", header = T, skip= 3, sep ='\t', na.string = "-")
prog = read.table("chapter2/progdata.txt", header = T, skip = 3, sep = '\t', na.strings = "-")
summary(sires)
str(sires)
summary(prog)
str(prog)
k = which(is.na(prog$weight))
prog = prog[-k,]
#prog$weight = as.numeric(as.character(prog$weight))
#str(prog)
plot(prog$weight, main="XY plot of weight by animal", xlab = "animal", ylab = "weight", col = "blue")
boxplot_weight = boxplot(prog$weight, ylab = "weight")
which(prog$weight == boxplot_weight$out)
prog = prog[-which(prog$weight > 400),]
str(prog)
index_m = grep("m", names(prog))
missing = numeric()
for(i in 1:length(index_m)){
    #print("i=",i)
    missing = c(missing,which(is.na(prog[,index_m[i]])))
    #print(missing)
}
print(missing)
missingU = unique(missing)
prog = prog[-missingU,]
summary(prog$m11)
boxplot(prog$weight ~ prog$sire, col = 1:length(levels(prog$sire)), main = "Boxplot of weights by sire")
boxplot(prog$weight ~ prog$sex, col = 1:length(levels(prog$sex)), main = "Boxplot of weights by sex")
plot(density(prog$weight), col = "blue", main = "Density plot of weights")
plot(prog$weight, col = prog$sex, pch = as.numeric(prog$sex), main = "XY plot of weight by animal", xlab = "animal", ylab = "weight")
legend("topleft", levels(prog$sex), col = 1:2, pch = 1:2)
###############################################################
#homework; p47 _cat #write reference
#compare genotypes of the sires with the offsprings.
index_m_sire = grep("m",names(sires))
index_m_sire_mat = matrix(index_m_sire, length(index_m_sire)/2,2,byrow = T)
index_m_prog = grep("m", names(prog))
index_m_prog_mat = matrix(index_m_prog, length(index_m_prog)/2,2,byrow = T)
library(made4)
#library(devtools)
#install_github("faarseer/systems_biology")
#library(homework2)

compatible = matrix(NA, length(sires$id), length(index_m_sire_mat[,1]))
for (j in 1:length(index_m_sire_mat[,1])){
    for (i in 1:length(sires$id)){
        indexs = which(prog$sire == sires$id[i]) # list of indexs that matches 'i'st sire
        sire_alleles_b = sires[i,index_m_sire_mat[j,]] # alleles of 'i'st sire and 'j' marker, list type
        sire_alleles = c(as.character(sire_alleles_b[,1]), as.character(sire_alleles_b[,2])) # vector of 2 characterized alleles
        hold_b = prog[indexs, index_m_prog_mat[j,]] # alleles of 'i'st sire`s offsprings and 'j' marker, list type
        hold_a = factor(c(as.character(hold_b[,1]), as.character(hold_b[,2]))) # vector of 2 characterized alleles
        hold = sort(summary(hold_a), decreasing = T) # sort
        top_alleles = names(hold)[1:2] # top 2 alleles of hold
        print("top_alleles")
        print(top_alleles)
        print("sire_alleles")
        print(sire_alleles)
        compatible[i,j] = length(comparelists(sire_alleles, top_alleles)$Set.Diff) # sum of unique alleles in sire_alleles against top_alleles
        #print("=======================================")
        #print(length(comparelists(sire_alleles, top_alleles)$Set.Diff))
        if (i ==1 & j == 1){ # check at i == 1, j == 1
            cat("allele counts in offspring\n")
            print(hold)
            cat("most common alleles in offspring\n")
            print(top_alleles)
            cat("sire alleles\n")
            print(sire_alleles)
        }
    }
}
print(compatible) # cannot find diff

compatible = matrix(NA, length(prog$id), length(index_m_sire_mat[,1]))
for (j in 1:length(index_m_sire_mat[,1])){
    for (i in 1:length(sires$id)){
        indexs = which(prog$sire == sires$id[i]) # list of indexs that matches 'i'st sire
        sire_alleles_b = sires[i, index_m_sire_mat[j,]] # alleles of 'i'st sire and 'j' marker, list type
        sire_alleles = c(as.character(sire_alleles_b[,1]), as.character(sire_alleles_b[,2])) # vector of 2 characterized alleles
        for (k in 1:length(indexs)){
            hold = prog[indexs[k], index_m_prog_mat[j,]] # 'j'st alleles of k'st prog 
            prog_alleles = c(as.character(hold[,1]), as.character(hold[,2])) # vector of 2 characterized alleles
            compatible[indexs[k],j] = length(comparelists(sire_alleles, prog_alleles)$intersect) # sum of intersects of sire_alleles and prog_alleles
        }
    }
}
compatible_df = as.data.frame(compatible) # convert compatible to a data.frame that can support factor type elements.
for (i in 1:length(compatible_df[1,])){
    compatible_df[,i] = factor(as.character(compatible_df[,i]))
}
summary(compatible_df)
prog = prog[-which(compatible_df[,1] == 0),] # drop the incorrectly genotyped offspring data.
######################################################################
#########170405########################
alleles = summary(factor(c(as.character(prog$m11), as.character(prog$m12))))
print(alleles)
alleles = alleles/ sum(alleles)
hold = data.frame(m11 = as.character(prog$m11), m12 = as.character(prog$m12))
hold[,1] = as.character(hold[,1])
hold[,2] = as.character(hold[,2])
sorted = character()
for (i in 1:length(hold[,1])){
    sorted = rbind(sorted, as.character(hold[i,]))
}
genotypes = paste(as.character(sorted[,1]), as.character(sorted[,2]), sep = '_')
genotypes = summary(factor(genotypes))
print(genotypes)
genotypes = genotypes/sum(genotypes)

split.screen(c(2,1))
screen(1)
barplot(sort(alleles, decreasing = T), col = 1:11, main = "Barplot of  alleleic frequencies")
screen(2)
barplot(sort(genotypes, decreasing = T), col = 1:11, main = "Barplot of genotypic frequencies")
close.screen(all = T)
#dev.print(file = "animgeno.pdf", device = pdf, width =8, height = 8)
################################################
############170410############################
allgeno = NULL
for (i in 1:length(index_m_prog_mat[,1])){
    hold = data.frame(prog[index_m_prog_mat[i,]])
    hold[,1]= as.character(hold[,1])
    hold[,2]=as.character(hold[,2])
    sorted = character()
    for (j in 1:length(hold[,1])){
        sorted = rbind(sorted, sort(as.character(hold[j,])))
    }
    genotypes = paste(as.character(sorted[,1]),as.character(sorted[,2]), sep = "_")
    allgeno = cbind(allgeno, genotypes)
}
colnames(allgeno) = c("M1","M2","M3","M4","M5")
markers = data.frame(prog[1:4], allgeno)
head(markers)
write.table(markers, "chapter2/cleandata.txt", quote = F, sep = "\t", row.names = F)
#########################################
###########170412########################
print(sires$id)
results = lm(weight ~ M1, data = markers)
summary(results)
plot(predict(results), residuals(results), main = "XY plot of residuals X fitted values", xlab= "fitted values(weight)", ylab = "residuals", col = "blue")
split.screen(c(2,1))
screen(1)
qqnorm(predict(results), col = "blue")
screen(2)
qqnorm(residuals(results), col = "blue", main = "")
close.screen(all = T)
fligner.test(weight~M1, data = markers)
shapiro.test(prog$weight)
anova(results)
summary(lm(weight~sex+M1, data = markers))
summary(lm(weight~sex+sire+M1, data = markers))
model1 = lm(weight~sex+sire+M1, data = markers)
model2 = lm(weight~sex+M1, data = markers)
model3 = lm(weight~M1, data = markers)
anova(model1, model2)
anova(model2, model3)
model4 = lm(weight~1, data = markers)
anova(model3,model4)
anova(model2,model4)
summary(lm(weight ~sex +M1, data = markers[which(prog$sire == "sire1"),]))
summary(lm(weight~ sex+sire+M2, data = markers))
summary(lm(weight~ sex+sire+M3, data = markers))
summary(lm(weight~ sex+sire+M4, data = markers))
summary(lm(weight~ sex+sire+M5, data = markers))
plot.design(weight~sex+sire+M5, data = markers, col = "blue")
for (i in grep("M", colnames(markers))){
    a = anova(lm(weight~sex+sire+markers[,i],data = markers))
    print(colnames(markers)[i])
    print(a)
}
##############################################################
library(car)
Anova(lm(weight~sex+sire+M1+M2+M3+M4+M5, data = markers), type = 3)
library(nlme)
linear_b = coefficients(lm(weight~sex+sire+M5,data = markers))[c(1,2,12:28)]
random_b = fixef(lme(weight~sex+M5, random=~1/sire, data = markers))
linear = data.frame(effect = names(linear_b), fixed = linear_b)
random = data.frame(effect = names(random_b), fixed = linear)
comparison_b = merge(linear, random, by = "effect")
comparison = data.frame(comparsion_b, difference = comparsion_b$fixed - comparsion_b$random)
print(comparsion)
#################070419################################
p = seq(0.1, length.out = 10000)
length(p)
head(p)
logP_obs <- -log10(p[2:10000]) +4
logP_exp <- -log10(p[2:10000]) +4
head(logP_exp)
plot(logP_exp,logP_obs, xlim = c(0,6), ylim = c(0,6))
abline(h = -log10(0.05/10000), col = "blue")
FDRp = c()
for(i in 1:9999){
    hold = 0.05 * i / 9999
    FDRp = append(FDRp, hold)
}
length(FDRp)
head(FDRp)
abline(lm(-log10(FDRp) ~ logP_exp), col = "red")

a = c("AA","AB","BB")
geno = rep(a, each = 500)
weight = c(rnorm(500,300,30),rnorm(500,305,30),rnorm(500,310,30))
anova(lm(weight ~ geno))
anova(lm(weight ~ geno))$F
actual <- anova(lm(weight ~ geno))$F[1]
perm <- c()
for(i in 1:10000){
    weight_perm <- sample(weight)
    hold = anova(lm(weight_perm ~ geno))$F[1]
    perm = append(perm,hold)
}
plot(density(perm))
abline(v = actual, col = "red")
plot(density(perm), xlim = c(0,20))
abline(v = actual, col = "red")
################170424##########################
#install.packages("RSQLite")
library(RSQLite)
#readLines("chapter3/SNPsample.txt", n = 15)
#readLines("chapter3/SNPmap.txt", n = 15)
con = dbConnect(dbDriver("SQLite"), dbname = "chapter3/SNP_sew")
dbWriteTable(con, "snpmap", "chapter3/SNPmap.txt", header = T, sep = "\t")
dbWriteTable(con, "SNP", "chapter3/SNPsample.txt", header = T, skip = 9, sep = "\t")
dbListTables(con)
dbListFields(con,"snpmap")
dbListFields(con,"SNP")
dbGetQuery(con, "select count(*) from snpmap")
dbGetQuery(con, "select count(*) from SNP")
dbGetQuery(con, "select count(*) from snpmap") * 83
animids = dbGetQuery(con, "select distinct animal from SNP")
dim(animids)
head(animids)
#always load data as character type
animids = as.vector(animids$animal)
hold = dbGetQuery(con, paste("select * from SNP where animal = '",animids[1],"'", sep =  ""))
dim(hold)
head(hold)
##
paste("select * from SNP where animals = '",animids[1],"'",sep="")
snpids = as.vector(dbGetQuery(con, "select distinct name from snpmap")[,1])
length(snpids)
head(snpids)
dbGetQuery(con, "CREATE INDEX chromosome_idx ON snpmap(chromosome)")
dbGetQuery(con, "CREATE INDEX snp_idx ON SNP(animal)")
dbGetQuery(con, "CREATE INDEX ID_idx ON SNP(snp)")
snpids = as.vector(dbGetQuery(con, "select distinct name from snpmap") [,1])
dbDisconnect(con)
###################################170501######################
library(RSQLite)
con = dbConnect(dbDriver("SQLite"), dbname = "chapter3/SNP_sew")
snp = dbGetQuery(con,paste("select * from SNP where snp = '", snpids[1],"'",sep=""))
dim(snp)
head(snp)
snp$allele1 = factor(snp$allele1)
snp$allele2 = factor(snp$allele2)
summary(snp$allele1)
summary(snp$allele2)
#snp = data.frame(snp, genotype = factor(paste(snp$allele1,snp$allele2,sep =""), levels = c("AA","AB","BB")))
#plot(snp$x, snp$y, col = snp$genotype, pch = as.numeric(snp$genotype),main = snp$snp[1])

#genotype <- paste(snp$allele1, snp$allele2, snp = "")
#head(genotype)
snp <- data.frame(snp,genotype = factor(paste(snp$allele1, snp$allele2, sep = ""),levels = c("AA","AB","BB")))
str(snp)
summary(snp)

plot(snp$x, snp$y, col = snp$genotype, pch = as.numeric(snp$genotype), main = snp$snp[1])
legend("bottomleft", levels(snp$genotype),col = 1:3, pch=1:3)

alleles = factor(c(as.character(snp$allele1), as.character(snp$allele2)), levels = c("A","B"))
summary(alleles) / sum(summary(alleles)) * 100
obs = summary(factor(snp$genotype, levels = c("AA","AB","BB")))
hwal = summary(factor(c(as.character(snp$allele1), as.character(snp$allele2)), levels = c("A","B")))
hwal = hwal / sum(hwal)
exp = c(hwal[1]^2,2*hwal[1]*hwal[2],hwal[2]^2 )*sum(obs)
names(exp) = c("AA","AB","BB")
xtot = sum((abs(obs-exp)-c(0.5,1.0,1.5))^2/exp)
pval = 1 - pchisq(xtot,2)
print(pval)
sumslides = matrix(NA,83,4)
rownames(sumslides) = animids
colnames(sumslides) = c("-/-","A/A","A/B","B/B")
numgeno = matrix(9,54977,83)
for(i in 1:83){
    hold = dbGetQuery(con, paste("select * from SNP where animal = '",animids[i],"'",sep = ""))
    hold = data.frame(hold,genotype = factor(paste(hold$allele1, hold$allele2, sep = ""), levels =c("--","AA","AB","BB")))
    hold = hold[order(hold$snp),]
    sumslides[i,] = summary(hold$genotype)
    temp  =hold$genotype
    levels(temp) = c(9,0,1,2)
    numgeno[,i] = as.numeric(as.character(temp))
    numgeno[(which(hold$gcscore < 0.6)), i] = 9
}
rownames(numgeno) = hold$snp
colnames(numgeno) = animids
dim(sumslides)
dim(numgeno)
##################170508###################################
numgeno[1:10, 1:3]
head(sumslides)
nrow(numgeno)
sample_hetero = sumslides[,3]/(sumslides[,2]+sumslides[,3]+sumslides[,4])
head(sample_hetero)
up = mean(sample_hetero) + 3*sd(sample_hetero)
down = mean(sample_hetero) - 3*sd(sample_hetero)
hsout = length(which(sample_hetero > up | sample_hetero < down))
plot(sort(sample_hetero), 1:83, col ="blue", cex.main=0.9, cex.aris = 0.8, cex.lab = 0.8, ylab = "sample", xlab = "heterozygosity",
     main = paste("Sample heterozygosity \n mean : ", round(mean(sample_hetero),3), "    sd:", round(sd(sample_hetero),3)),
     sub = paste("mean: black line    ",3,"SD: red line    number of outliers:",hsout), cex.sub = 0.8)
abline(v = mean(sample_hetero)) # v means vertical.
abline(v = up, col= "red")
abline(v = down, col = "red")
dev.print(file = "sample_hetero.pdf", device = pdf, width = 6, height = 6)

##heatmap
animcor = cor(numgeno)
numgeno[1:4,1:10]
animcor[1:4,1:10]
library(gplots)
hmcol = greenred(256)
heatmap(animcor, col = hmcol, symm = T)

ma = matrix(c(1,2,3,100,200,300,400,500,600), nrow = 3, byrow = T)
cor(ma)
ma
heatmap(cor(ma), col = hmcol, symm= T)

#3.5
genotypes = read.table("chapter3/SNPxSample.txt", header = T, sep = "\t", na.strings = "9", colClasses = "factor")
dim(genotypes)
summary(genotypes)
head(genotypes)
for (i in 1:ncol(genotypes)){
    levels(genotypes[,i]) = c("AA","AB","BB", NA)
}

indexsnp = apply(genotypes,1,function(x) length(which(is.na(x) == T))) # margin == 1 indicates rows, 2 indicates cols.
indexsnp2 = which(indexsnp == 83)
length(indexsnp2)
genotypes = genotypes[-indexsnp2,]
weight = rnorm(83,mean = 50, sd = 10)
plot(density(weight), col = "blue", main = "Density plot of weights")
abline(v = mean(weight), col = "red")
lines(density(rnorm(83000, mean = 50, sd = 10)), col= "green", lty = 2)
singlesnp = function(trait, snp){
    if(length(levels(snp))>1){
        lm(trait ~snp)
    } else{ NA}
}

dim(genotypes)

results = apply(genotypes,1,function(x) singlesnp(weight, factor(t(x))))
results

###################################################################

pvalfunc = function(model){
    if(class(model) == "lm"){
        anova(model)[[5]][1]
    } else{ NA}
}

pvals = lapply(results, function(x) pvalfunc(x))
pvals

######170515###################################


## Example 1 ###
geno = readRDS("chapter4/genotypes.rds")
dim(geno)
freqA=numeric(10000)
freqB=numeric(10000)

for (i in length(1)){
    start = Sys.time()
    for (i in 1:10000) {  
        hold=0
        for (j in 1:2000) {
            hold=hold+geno[i,j]
        }
        freqB[i]=hold/4000
        freqA[i]=1-freqB[i]
    }
    end = Sys.time()
    print(end - start)
    }

## Example 2 ###
freqA=numeric(10000)
freqB=numeric(10000)
start = Sys.time()
for (i in 1:10000) {
    freqB[i]=sum(geno[i,])/4000
    freqA[i]=1-freqB[i]
}
end = Sys.time()
end - start


## Example 3 ###
start = Sys.time()
freqB=rowSums(geno)/4000
freqA=1-freqB
end = Sys.time()
end - start


## Example 4 ###
start = Sys.time()
freqB=apply(geno,1,function(x) sum(x))/4000
freqA=1-freqB
end = Sys.time()
end - start

pheno = rnorm(2000, mean = 100, sd = 10)
pvals = numeric(10000)
for (j in length(1)){
    start = Sys.time()
    for (i in 1:100){
        pvals[i] = coef(summary(lm(pheno~geno[i,])))[2,4]
    }
    end = Sys.time()
    print(end - start)
}

for (i in length(1)){
    pvals = apply(geno, 1, function(x) coef(summary(lm.fit(pheno ~ x)))[2,4])
}

A = matrix(c(1,2,3,4,3,2,1,5,3),3, byrow = T)
B = matrix(c(9,8,7,6,5,4,3,2,1),3, byrow = T)
I = diag(nrow = 3)
A
B
I
A+B
A+3
A*2
A*B # hadamard product
A %*% B
sum(A[1,]*B[,1])
t(A)
solve(A, I)
round(solve(A) %*% A, 3)

M = geno -1
M2 = M * geno
M2 = pheno[1:13]*M
Mt = t(M)
dim(M)
dim(Mt)
MtM = Mt %*% M
MtM[1,1]
sum(Mt[1,] * M[,1])
MtM = M %*% Mt
dim(MtM)

MtM = crossprod(M) # same as t(M) %*% M

A = matrix(c(3,-4,1,5),2)
y = c(3,7)
solve(A) %*% y
A = martix(c(1,2,3,-1,2,1),3,2)
y = c(2,16,18)

B = t(A) %*% A
solve(B) %*% t(A) %*% y

#4.6.1

sos = readRDS("chapter4/sosData.rds")
str(sos)
summary(factor(colnames(sos)))
sos[1:5,1:5]

M = matrix(NA, nrow(sos),3)
colnames(M) = c("hanwoo", "angus", "brahman")
M[,1] = apply(sos[,which(colnames(sos) == "hanwoo")],1,function(x) sum(x)/50)
M[,2] = apply(sos[,which(colnames(sos) == "angus")],1,function(x) sum(x)/50)
#M[,3] = apply(sos[,which(colnames(sos) == "brahman")],1,function(x) sum(x)/(length(x)*2))
M[,3] = rowSums(sos[,which(colnames(sos) == "brahman")])/50
head(M)
meansB = rowMeans(M)
alleleVar = meansB *(1 - meansB)
meansDevB = M - meansB
FST = meansDevB^2 / alleleVar
head(FST)


aa = sort(FST[,1])
length(aa)
length(FST[,1])
plot(FST[,1], type = "l",xlab = "SNP", ylab = "Fst", col = "gray")
smoothred = runmed(FST[,1],k = 101, endrule = "constant")
#runmed outputs median of data, set of number of k, endrule .. if the set of datas over original data,, 1 자리로순환.
lines(smoothred, type = "l", col = "red",lwd = 6)
FSTm = mean(smoothred)
FSTsd = sd(smoothred)
sig = FSTm + 3* FSTsd
abline(h = sig)
    print(sig)
lines(which(smoothred > sig),smoothred[which(smoothred > sig)], type = "l", col = "green",lwd = 6)
#4.6.2 skip
#4.6.3
#source("http://bioconductor.org/biocLite.R")
#biocLite("StAMPP")
library(StAMPP)
sos[1:3,1:3]
M = t(sos)
M[which(M ==0)] = "AA"
M[which(M ==1)] = "AB"
M[which(M ==2)] = "BB"
M[1:3,1:3]

M= cbind(sample = 1:nrow(M), pop = rownames(M), ploidy = 2, format = "BiA", M)

M[1:5,1:7]
M = as.data.frame(M)
M[1:30, 1:7]
M = stamppConvert(M, type = "r")
M[1:6,1:7]

FST = stamppFst(M, nboots = 200, percent = 95)
FST$Fsts

#############################170522#######################

names(FST)
dim(FST$Bootstraps)
FST$Bootstraps[,1:6]
FST$Bootstraps[,202:206]
#biocLite("snpStats")
library(snpStats)

M = new("SnpMatrix", t(sos+1))
ldHan = ld(M[1:25,], depth = 1000, stats = "D.prime")
ldHan[1:3,1:3]
ldAll = ld(M, depth = 1000, stats = "D.prime")
dim(ldHan)
cols = colorRampPalette(c("yellow", "red"))(11)
image(ldHan[1:1000,1:1000], lwd = 0, cuts = 10, col.regions = cols, colorkey =T)
image(ldAll[1:1000,1:1000], lwd = 0, cuts = 10, col.regions = cols, colorkey =T)
#biocLite("ape")
library(ape)
M = sos #책에빠졋대.
M[1:3,1:3]
dim(M)


#allele sharing
allshare = matrix(NA, ncol(M), ncol(M))
rownames(allshare) = paste(colnames(M),"_",1:ncol(M), sep = "")
colnames(allshare) = paste(colnames(M),"_",1:ncol(M), sep = "")
for(i in 1:ncol(M)){
    hold = abs(M- M[,i])
    allshare[,i] = colMeans(hold, na.rm =T)
}
allshare[1:5,1:5]
pop = as.character(colnames(M))
cols = c("black","blue","red")
heatmap(allshare, symm = T, col = gray.colors(32,start=0, end = 1), RowSideColors = cols[as.numeric(as.factor(pop))], 
        ColSideColors = cols[as.numeric(as.factor(pop))])
legend("topleft", levels(pop), legend = levels(as.factor(pop)),fill = cols, cex = 0.8)

dim(M)
M = sos
M = M-1
p = rowMeans(M, na.rm = T)/2
plot(density(p))
P = (p-0.5)*2
plot(density(P))
Z = M-P
Z[1:5,1:5]
ZtZ = t(Z) %*% Z
dim(ZtZ)
ZtZ[1:5,1:5]
d = 2*sum(p*(1-p))
G = ZtZ /d
rownames(G) = pop
colnames(G) = pop
heatmap(G, symm = T, col = gray.colors(16,start=0, end = 1), RowSideColors = cols[as.numeric(as.factor(pop))], 
        ColSideColors = cols[as.numeric(as.factor(pop))])
legend("topleft", levels(as.factor(pop)), fill = cols[1:length(levels(as.factor(pop)))])
save.image("~/Desktop/systems biology/170522.Rdata")
#####################170524###############################################
z = matrix(1:12,4)
z
colnames(z) = c("s1","s2","s3")
g = matrix(NA, 3,3)
colnames(g) = c("s1","s2","s3")
colnames(g) = c("s1","s2","s3")
g
for(j in 1:3){
    for(k in 1:3){
        g[j,k] = sum(z[,j] * z[,k])
    }
}
g
t(z) %*% z
#SVD ; singular value decomposition (of the GRM Matrix)
#SVD is the main algorithm behind principal component analysis(PCA)
#skip SVD
#4.7 parentage testing
#SNP opposing homozygote between F0 F1 can be reliable but not in like..F0 F2
#two allele share one allele share zero allele share
#half-sibs can be shared quatile.
M = readRDS("chapter4/parData.rds")
dim(M) # 20000 snps 312 individuals.
ped = read.table("chapter4/pedigree.txt", header = T, sep = "\t")
str(ped)
dim(M)
M[1:5,1:5]
#sire father id
#dam mother id
#group which family
#0 =sire, 1=dam, 2= full-sibs, 3= half-sibs.
summary(ped)
#OP matrix
AA_homo = matrix(0, nrow(M), ncol(M))
BB_homo = matrix(0, nrow(M), ncol(M))
AA_homo[M==0] = 1
BB_homo[M==2] = 1
op = t(AA_homo) %*% BB_homo
dim(op)
op[1:5,1:5]
AA_homo[1:5,1:5]
#row for AA, col for BB, if [1,2] means sum of 1st sample == AA && 2nd sample == BB
length(which(M[,1] ==0 & M[,2]== 2))
op = t(op) + op #what we want is the sum of M[,1] ==0 & M[,2]==2 | M[,1]==2 & M[,2]==0
op[1:5,1:5]
dim(op)
rm(AA_homo, BB_homo)
groups = ped$sire
groups
groups[1:12] = "sire"
groups[13:120] = "dam"
groups[121:312] = paste("fam", groups[121:312])
groups = as.factor(groups)
library(made4)
cols = getcol(length(levels(groups)))
heatmap(op,symm = T, col = gray.colors(32, start=0,end = 1), RowSideColors = cols[as.numeric(groups)], 
        ColSideColors = cols[as.numeric(groups)])
legend("topleft", levels(groups), fil = cols[1:14])

#upper triangular matrix of op
plot(sort(op[upper.tri(op)]), ylab = "no of op", pch = 20)

#########170531#####################
library(affy)
filenames = c(paste("ctrl",1:5, ".CEL", sep = ""),paste("treat",1:5, ".CEL", sep = ""))
Names = c(paste("C",1:5,sep = ""),paste("T", 1:5, sep = ""))
slides = ReadAffy(filenames = paste("chapter5/", filenames, sep = ""), sampleNames = Names)
print(slides)
dim(pm(slides))
#biocLite("affyPLM")
library(affyPLM)
PLM = fitPLM(slides)
par(mfrow = c(2,2))
image(slides[,1],main ="log raw intensities")
image(PLM, type = "weights",which = 1, main = "weights", add = T)
image(PLM, type = "resids",which = 1, main = "residuals")
image(PLM, type = "sign.resids",which = 1, main = "sign of residuals")

##############170605#########################
#load("lecture_170531.Rdata")
treatcol = c(3,3,3,3,3,2,2,2,2,2)
Mbox(PLM, col = treatcol)
Median = apply(coefs(PLM),1,function(x) median(x))
#fitPLM : RMA model => log2 scale
boxplot(coefs(PLM) - Median, col = treatcol)
#median 거리먼것들이 안좋은..것임.
par(mfrow = c(2,1))
boxplot(slides, col = treatcol)
hist(slides, col = treatcol, lty =1)
library(ABarray)
cor = pm(slides)
cor = cor(cor)
matrixPlot(cor, nrgcols = 21, k = 21)

#preprocessing
library(affy)
RMA = rma(slides)
RMA = exprs(RMA)
par(mfrow = c(2,1))
boxplot(RMA,col = treatcol, main = "Boxplot of normalized log intensities", show.names = F)
plot(density(RMA[,1]), col = treatcol[1], main = "Histogram of normalied log intensities",
     xlm = c(min(RMA), max(RMA)), xlab = "log2 intensity")
for(j in 2:length(treatcol)){
    lines(density(RMA[,j]), col = treatcol[j])
}
#background noise 제거완료

PCA = princomp(RMA)
PCA = loadings(PCA)[,1:2]
plot(PCA, col = treatcol, main = "principal components plot", pch = treatcol, xlab = "PCA1", ylab = "PCA2")
PCA = princomp(RMA - rowMeans(RMA))
PCA = loadings(PCA)[,1:2]
plot(PCA, col = treatcol, main = "principal components plot", pch = treatcol, xlab = "PCA1", ylab = "PCA2")

which(PCA[,1] > -0.31)

#####################170607#############################
###5.5 analysis of differential expression

library(limma)
#to use limma , you need design matrix, contrasts matrix...

design = cbind(c(rep(1,5), rep(0,5)), c(rep(0,5),rep(1,5)))
colnames(design) = c("ctrl","treat")
design

contrasts = matrix(c(1,-1), byrow = F)
colnames(contrasts) = "ctrl-treat"
rownames(contrasts) = c("ctrl", "treat")
contrasts

RMA[1:2,1:5]
rowMeans(RMA[1:2,1:5])
fit$coefficients[1:2,]

fit = lmFit(RMA,design)
head(fit$coefficients)
fitc = contrasts.fit(fit, contrasts) #ctrl - treat ; positive means more gene expression in ctrl
head(fitc$coefficients)
fitb = eBayes(fitc) # std 
head(fitb$t)
head(fitb$p.value)
#M = log2(PM_ij/PM_kj) (i,k = group, j = probe), A = 1/2(log2(PM_ij) +log2(PM_kj)) = log2(sqrt(PM_ij* PM_kj))
head(fitc$Amean) # median of fit$coefficients
head(fit$coefficients)
#MA plot
smoothScatter(fitb$Amean, fitb$coefficients, nrpoints = 500, xlab = "A", ylab = "M", cex.main = 0.9)
abline(h =1); abline(h = -1)
#volcano plot
lod = -log10(fitb$p.value)
o1 = which(fitb$coefficients > 1 | fitb$coefficients < -1)
o2 = which(fitb$p.value <0.01)
o = intersect(o1,o2)
plot(fitb$coefficients, lod, xlab = "difference in expression(M)", ylab = "t-statistic p-values(-log10)", cex.main = 0.95, 
     cex = 0.25, main = "volcano plot")
abline(h = 2) #abline(h = -log10(0.01))
abline(v = 1)
abline(v = -1)
points(fitb$coefficients[o1], lod[o1], pch = 19, col = "blue")
points(fitb$coefficients[o2], lod[o2],pch = 19, col = "red")
points(fitb$coefficients[o2], lod[o2],pch = 19, col = "green")
#green points are significantly efficient
res = data.frame(Foldchange = fitb$coefficients, p.value = fitb$p.value, Amean = fitb$Amean)
names(res) = c("Foldchange", "p.value", "Avalue")
res = res[order(res$p.value),]
head(res)
length(which(res$p.value < 0.01))
length(which(res$p.value < 0.01 & res$Avalue > 8))