##### sos.url<-"http://people.stat.sc.edu/hoyen/Stat588/Data/sosData.rds" sos<-readRDS(gzcon(url(sos.url))) sos[1:5,1:5] pop<-table(colnames(sos)) pop npop<-length(pop) ##### Are there missing values in the data? ina<-apply(sos, 1, function(x) sum(is.na(x))) which(ina>0) ##### calculate allele frequency for each SNP in each population afreq<-function(genotype){ ans<-sum(genotype)/(length(genotype)*2) return(ans) } M<-matrix(NA, nrow=nrow(sos), ncol=npop) #### calculate allele frequencies #### Take this out M[,1]<-apply(sos[,which(colnames(sos)==names(pop)[1])], 1, afreq) M[,2]<-apply(sos[,which(colnames(sos)==names(pop)[2])], 1, afreq) M[,3]<-apply(sos[,which(colnames(sos)==names(pop)[3])], 1, afreq) colnames(M)<-names(pop) #### calculate Fst fst<-matrix(NA, nrow=nrow(M), ncol=npop) colnames(fst)<-names(pop) meanp<-rowMeans(M) Dev<-M-rowMeans(M) tvar<-numeric() ######## Exercise: calculate allele frequencies and Fst. meanp<-rowMeans(M) Dev<-M-rowMeans(M) tvar<-numeric() fst<-matrix(NA, nrow=nrow(M), ncol=npop) for(i in 1:nrow(M)){ tvar<-meanp[i]*(1-meanp[i]) fst[i,]<-Dev[i,]^2/tvar } colnames(fst)<-names(pop) ######################## x=c(1:nrow(fst)) plot(x,fst[,3]) smoothed<-runmed(fst[,3], k=101, endrule="constant") lines(smoothed, lwd=3, col=2) fstm<-mean(smoothed) fstsd<-sd(smoothed) sig<-fstm+3*fstsd abline(h=sig, col="grey", lwd=3) ##### calculate LD library(snpStats) ##### snpStats requires data coding as: 1, 2, 3 (0 missing) M<-new("SnpMatrix", t(sos)+1) colSum<-col.summary(M[1:25,]) colSum[25:30,] ldHan<-ld(M[1:25,], depth=ncol(M)-1, stats="D.prime") ldHan[1:4, 1:4] ldAll<-ld(M, depth=ncol(M)-1, stats="D.prime") cols<-colorRampPalette(c("yellow", "red"))(10) #### heatmap image(ldHan[1:1000, 1:1000], lwd=0, cuts=9, col.regions=cols, colorkey=T) ######## calculate allele sharing mean(abs(sos[,1]-sos[,2])) allshare<-matrix(NA, ncol(sos), ncol(sos)) colnames(allshare)<-colnames(sos) rownames(allshare)<-colnames(sos) ##### Exercise: Calculate allele sharing matrix for all pairs of samples in SOS data. for (i in 1:ncol(sos)){ hold<-abs(sos-sos[,i]) allshare[,i]<-colMeans(hold, na.rm=T) } allshare[1:5,1:5] #### heatmap of allele sharing heatmap(allshare, symm=T, col=gray.colors(16, start=0, end=1), RowSideColors=cols[as.numeric(pop)], ColSideColors=cols[as.numeric(pop)]) ##### try a different color heatmap(allshare, symm=T, col=terrain.colors(16, alpha=1), RowSideColors=cols[as.numeric(pop)], ColSideColors=cols[as.numeric(pop)]) legend("topleft", levels(pop), fil=cols[1:length(levels(pop))]) #### tree library("ape") tree<-as.phylo(hclust(as.dist(allshare), method="ward.D2")) pop<-as.factor(colnames(sos)) cols<-c("black", "blue", "red") plot(tree, type="cladogram", edge.color="gray", direction="r", cex=0.7, adj=0, tip.color=cols[as.numeric(pop)]) legend("topleft", levels(pop), fil=cols, cex=0.8) ################### PCA G<-allshare-mean(allshare) SVD<-svd(G) plot(SVD$v[,1], SVD$v[,2], cex.main=0.9, main="Principal components", pch=16, xlab="PC1", ylab="PC2", col=cols[pop]) legend("topleft", levels(pop), fil=cols[1:length(levels(pop))], cex=0.8) #### variances of components variances<-SVD$d^2/(ncol(G)-1) variances2<-round(variances*100/sum(variances),3) variances2[1:2]