#################################################
# C02: 群集分析                                 #
# 吳漢銘 國立政治大學統計學系                   #
# http://www.hmwu.idv.tw/                       #
#################################################

#  12/122

iris.scaled <- scale(iris[, -5])
iris.km <- kmeans(iris.scaled,  centers=3)
fviz_cluster(iris.km, iris[, -5])
fviz_cluster(iris.km, iris[, -5], ellipse.type = "norm")


#  25/122
cell.matrix  <- read.table("YeastCellCycle_alpha.txt", header=TRUE, row.names=1)
n <- dim(cell.matrix)[1]
p <- dim(cell.matrix)[2]-1
cell.data <- cell.matrix[,2:p+1]
gene.phase <- cell.matrix[,1]
phase <- unique(gene.phase)
phase.name <- c("G1", "S", "S/G2", "G2/M", "M/G1")
cell.sdata <- t(scale(t(cell.data))) 
no.group <- 5
no.iter <- 20
cell.kmeans <- kmeans(cell.sdata, no.group, no.iter)
plot(cell.sdata[,1:4], col = cell.kmeans$cluster)

## PCA
plot(pca.dim1, pca.dim2,  main="PCA for Cell Cycle Data", 
     xlab="PCA-1", ylab="PCA-2", col=gene.phase)

plot(pca.dim1, pca.dim2,  main="PCA for Cell Cycle Data with K-means Clustering", 
     xlab="PCA-1", ylab="PCA-2", col=cell.kmeans$cluster)

# 26/122

data(USArrests) 
pairs(USArrests, panel = panel.smooth, main = "USArrests data") 


# 27/122

no.group <- 5
no.iter <- 20
USArrests.kmeans <- kmeans(USArrests, no.group, no.iter)
plot(USArrests, col = USArrests.kmeans$cluster, 
     main = "K-means: USArrests data")

## PCA
USArrests.pca <- princomp(USArrests, cor=TRUE, scores=TRUE)
pca.dim1 <- USArrests.pca$scores[,1]
pca.dim2 <- USArrests.pca$scores[,2]
plot(pca.dim1, pca.dim2,  main="PCA for USArrests Data with K-means", xlab="PCA-1", ylab="PCA-2", 
     col=USArrests.kmeans$cluster)

## MDS
USArrests.mds<- cmdscale(dist(USArrests))
mds.dim1 <- USArrests.mds[,1]
mds.dim2 <- USArrests.mds[,2]
plot(mds.dim1, mds.dim2, xlab="MDS-1", ylab="MDS-2", 
     main="MDS for USArrests Data with K-means", col = USArrests.kmeans$cluster)


# 35/122

# install.packages("vcd")
library(vcd) # Visualizing Categorical Data
?Arthritis # 關節炎Treatment Data
data(Arthritis)
str(Arthritis)
head(Arthritis) 


# 36/122

# install.packages("klaR")
library(klaR) # Classification and visualization
res <- kmodes(Arthritis[,c(2,3,5)], modes=3)
res

table(Arthritis[,c(2,3,5)])


# 38/122

iris.pam <- pam(iris[,1:4], k=3)
iris.pam
plot(iris.pam)

iris.pam <- pam(iris[,1:4], 3)$clustering
clusplot(iris[, 1:4], iris.pam, color = TRUE)


# 39/122

require(ggplot2)
data(diamonds)
dim(diamonds)
head(diamonds)
system.time(
  clara.res <- clara(diamonds[, -(2:4)], k=10, stand = T, samples = 500, pamLike = TRUE)
)
clara.res


# 40/122

system.time(
   clara.res <- clara(diamonds[, -(2:4)], k=10, stand=T, samples=500, pamLike=TRUE))   
clara.res


# 44/122

iris.fc <- cmeans(iris[,1:4], centers=3)
names(iris.fc)
# Visualize using corrplot
library(corrplot) # A visualization of a correlation matrix

id <- c(1:3, 51:53, 101:103)
corrplot(iris.fc$membership[id,], is.corr = FALSE)
corrplot(iris.fc$membership[id,], method = "number", is.corr = FALSE)
iris.fc$cluster


# 54/122

gene.name <-  rownames(cell.matrix)
## Hierarchical Clustering on genes
cell.gene.hc.ave <- hclust(dist(cell.sdata), method = "ave")
plot(cell.gene.hc.ave, hang = -1, cex=0.5, labels=gene.name)
## Hierarchical Clustering on experiments
cell.exp.hc.ave <- hclust(dist(t(cell.sdata)), method = "ave")
plot(cell.exp.hc.ave, cex=0.8)


# 55/122

x  <- as.matrix(mtcars)
?heatmap
row.color <- rainbow(nrow(x), start=0, end=.3)
col.color <- rainbow(ncol(x), start=0, end=.3)
hv <- heatmap(x, col = cm.colors(256), scale="column", 
              RowSideColors = row.color, ColSideColors = col.color, 
              margins=c(5,10),
              xlab = "specification variables", ylab= "Car Models",
              main = "Heatmap(mtcars)(Range Column Condition)")
names(hv)


# 56/122

cell.matrix  <- read.table("YeastCellCycle_alpha.txt", header=TRUE, row.names=1)
n <- dim(cell.matrix)[1]
p <- dim(cell.matrix)[2]-1
cell.data <- cell.matrix[,2:p+1]
gene.phase <- cell.matrix[,1]
phase <- unique(gene.phase)
phase.name <- c("G1", "S", "S/G2", "G2/M", "M/G1")
cell.sdata <- scale(cell.data)
GBRcol <- color.Palette(low="green", mid="black", high="red")
rc <- rainbow(5)[as.integer(gene.phase)]
cc <- rainbow(ncol(cell.sdata))
hv <- heatmap(cell.sdata, col = GBRcol, scale = "column", Colv=NA, Rowv=NULL,
              RowSideColors = rc, 
              ColSideColors = cc, margins = c(5,10),
              xlab = "Times", ylab =  "Genes",
              main = "Heatmap of Microarray Data")


# 57/122

# install.packages("fields")
library(fields)
gbr <- two.colors(start="green", middle="black", end="red")
cell.raw <- read.table("trad_alpha103.txt", row.names=1, header=T)
cell.data <- t(scale(t(cell.raw[,2:19]), center=T, scale=T))    
n <- nrow(cell.data)
p <- ncol(cell.data)
gene.phase <- cell.raw[,1]
range(cell.data)
cell.data[cell.data > 2.802712] <- 2.802712 
cellcycle.color <- c("darkgreen", "blue", "red", "gray50", "orange")
rc <- cellcycle.color[gene.phase+1]
cc <- rainbow(ncol(cell.data))

hv1 <- heatmap(cell.data[n:1,], col = gbr, Colv=NA, Rowv=NA,
               RowSideColors = rc, 
               ColSideColors = cc, margins = c(5,10),
               xlab = "Times", ylab =  "Genes",main = "Heatmap of Microarray Data")

hv2 <- heatmap(cell.data, col = gbr, Colv=NA, Rowv=NULL,
               RowSideColors = rc, 
               ColSideColors = cc, margins = c(5,10),
               xlab = "Times", ylab =  "Genes",main = "Heatmap of Microarray Data")

dd <- as.dendrogram(hclust(as.dist(1-cor(t(cell.data)))))
hv3 <- heatmap(cell.data, col = gbr, Colv=NA, Rowv=dd,
              RowSideColors = rc, 
              ColSideColors = cc, margins = c(5,10),
              scale = "row",
              xlab = "Times", ylab =  "Genes",main = "Heatmap of Microarray Data")


# 59/122

install.packages("gplots")
library(gplots)
?heatmap.2
library(affy)
data(SpikeIn)
pms <- SpikeIn@pm

# just the data, scaled across rows
heatmap.2(pms, col=rev(heat.colors(16)), main="SpikeIn@pm", 
              xlab="Relative Concentration", ylab="Probeset", 
              scale="row")

# fold change vs "12.50" sample
data <- pms / pms[, "12.50"]
data <- ifelse(data>1, data, -1/data)
heatmap.2(data, breaks=16, col=redgreen, tracecol="blue", 
          main="SpikeIn@pm Fold Changes\nrelative to 12.50 sample",                    
          xlab="Relative Concentration", ylab="Probeset")


# 71/122

library(som)
cell.som <- som(cell.sdata, xdim=5, ydim=4, topol="rect", neigh="gaussian")
plot(cell.som) 


# 89/122

kmeans(iris[, 1:4], 3)


#  90/122

library(cluster)
x <- iris[, 1:4]
gskmn <- clusGap(x, FUN = kmeans, nstart = 20, K.max = 8, B = 60)

plot(gskmn, main = "clusGap(., FUN = kmeans, n.start=20, B= 60)")


# 104/122

library(clValid)
data(mouse)
head(mouse)

# internal validation
express <- mouse[1:25,c("M1", "M2", "M3", "NC1", "NC2", "NC3")]
rownames(express) <- mouse$ID[1:25]
head(express)

intern <- clValid(express, 2:6, clMethods=c("hierarchical", "kmeans", "pam"),
                  validation="internal")


# 105/122

summary(intern) # view results
par(mfrow=c(1, 3))
plot(intern)  


# 106/122

stab <- clValid(express, 2:6, clMethods=c("hierarchical","kmeans","pam"),
                validation="stability") 

summary(stab)
par(mfrow=c(4, 1))
plot(stab)


# 117/122

# biological measures (functional classes predetermined)
fc <- tapply(rownames(express), mouse$FC[1:25], c)
fc


# 118/122
fc <- fc[-match( c("EST","Unknown"), names(fc))]
bio <- clValid(express, 2:6, clMethods=c("hierarchical", "kmeans", "pam"),
               validation="biological", annotation=fc)
summary(bio)
par(mfrow=c(2, 1))
plot(bio)