#################################################
# F01: 探索式資料分析與維度縮減                 #
# 吳漢銘 國立政治大學統計學系                   #
# https://hmwu.idv.tw                           #
#################################################

# 19/82
browseURL("https://dzone.com/articles/text-analysis-of-trumps-tweets-confirms-he-writes")



# 22/82
head(anscombe, 3)
apply(anscombe, 2, mean)
apply(anscombe, 2, sd)
mapply(cor, anscombe[,1:4], anscombe[,5:8])
mapply(function(x, y) lm(y~x)$coefficients, anscombe[, 1:4], anscombe[, 5:8])


par(mfrow = c(2, 2))
regplot <- function(x, y){ 
  plot(y~x) 
  abline(lm(y~x), col = "red")
}
mapply(regplot, anscombe[, 1:4], anscombe[, 5:8])



# 23/82
#install.packages("datasauRus")
browseURL("https://cran.r-project.org/web/packages/datasauRus/vignettes/Datasaurus.html")



# 33/82
library(rgl)
open3d()
x <- sort(rnorm(1000))
y <- rnorm(1000)
z <- rnorm(1000) + atan2(x,y)
plot3d(x, y, z, col = rainbow(1000), size = 2)

M <- par3d("userMatrix")
play3d(par3dinterp(userMatrix = list(M, rotate3d(M, pi/2, 1, 0, 0), rotate3d(M, pi/2, 0, 1, 0) ) ), duration = 4)



# 34/82
open3d()
plot3d(iris[,1:3], col = as.integer(iris[,5])+1, type  = "p", size = 10)

plot3d(iris[,1:3], col = as.integer(iris[,5])+1, type  = "s", radius = 0.15)
# Add bounding box decoration
bbox3d(color = c("red", "black"), emission = "gray", specular = "yellow", 
       shininess = 5, alpha = 0.8, nticks = 3) 
aspect3d(1,1,1)

lines3d(iris[c(1, 150), 1:3], col = "purple", lwd = 2)

shapes <- list(cube3d(), tetrahedron3d(), octahedron3d(), icosahedron3d(), dodecahedron3d(), cuboctahedron3d())
shapelist3d(shapes, x = 1, y = 1:6, z = 1, size = 0.3, col = 1:6)
aspect3d(1,1,1)

texts3d(x = 2, y = 6, z = 6, texts = "rgl Example", font = 2, color = "blue", cex = 2, family = "serif")

# Show regression plane with z as dependent variable
fit <- lm(iris[,3] ~ iris[,1] + iris[,2])
coefs <- coef(fit)
a <- coefs[2] # x
b <- coefs[3] # y
c <- -1 # z
d <- coefs["(Intercept)"]
planes3d(a, b, c, d, alpha = 0.5) # planes3d draws planes using the parametrization a x + b y + c z + d = 0.

play3d(spin3d(axis = c(0, 0, 1), rpm = 20), duration = 4)



# 35/82
library(rgl)
open3d()
comet <- readOBJ(url("http://sci.esa.int/science-e/www/object/doc.cfm?fobjectid = 54726"))
class(comet)
str(comet)
shade3d(comet, col = "gray")

# export graphics
rgl.snapshot("test.png")
rgl.postscript("test.pdf", "pdf")


library(rgl)
x <- sort(rnorm(100))
y <- rnorm(100)
z <- rnorm(100) + atan2(x,y)
open3d(windowRect = c(100, 100, 700, 700))
plot3d(x, y, z, col = rainbow(100))
#方法一，壓縮圖檔，會失真 
rgl.snapshot("test.png")
#方法二，向量檔，但有時不能完成將圖成功匯出
rgl.postscript("persp3d.pdf", "pdf")
getwd()
dir()



# 38/44
library(ggplot2)
library(maps)
library(mapproj)
states.map <- map_data("state")
head(states.map, 3)
tail(states.map, 3)
ggplot(states.map, aes(x = long, y = lat, group = group)) + 
  geom_polygon(fill = "white", colour = "black")
ggplot(states.map, aes(x = long, y = lat, group = group)) +
  geom_path() + coord_map("mercator")



# 39/44
world.map <- map_data("world")
sort(unique(world.map$region))
country <- c("France", "Austria", "Italy", "Switzerland", "Germany", "Spain", "Czech Republic")
mymapdata <- map_data("world", region = country)
ggplot(mymapdata, aes(x = long, y = lat, group = group, fill = region)) + 
  geom_polygon(colour = "black") + 
  scale_fill_brewer(palette = "Set2")



# 40/44
head(USArrests, 3)
crimes <- data.frame(state = tolower(rownames(USArrests)), USArrests)
head(crimes, 3)
states.map <- map_data("state")
head(states.map, 3)
crime.map <- merge(states.map, crimes, by.x = "region", by.y = "state")
head(crime.map, 3)



# 41/44
library(plyr) 
crime.map <- arrange(crime.map, group, order)
head(crime.map, 3)
ggplot(crime.map, aes(x = long, y = lat, group = group, fill = Assault)) +
  geom_polygon(colour = "black") +
  coord_map("polyconic")

ggplot(crime.map, aes(x = long, y = lat, group = group, fill = Assault)) +
    geom_polygon(colour = "black") +
    coord_map("polyconic") +
    scale_fill_gradient2(low = "blue", mid = "grey", high = "red",
                         midpoint = median(crimes$Assault)) 



# 45/82
iris.sub <- iris[sample(1:150, 8),1:4]
iris.sub
M.svd <- svd(iris.sub)
M.svd
M.svd$u %*% (diag(M.svd$d) %*% t(M.svd$v)) 

d.sub <- diag(M.svd$d[1:2])   
u.sub <- as.matrix(M.svd$u[, 1:2])
v.sub <- as.matrix(M.svd$v[, 1:2])
iris.sub.approx <- u.sub %*% d.sub %*% t(v.sub)
iris.sub.approx

sum((iris.sub - iris.sub.approx)^2)


# 46/82
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("EBImage")

library(EBImage) #(Repositories: BioC Software)
lena <- readImage("lena.jpg")
dims <- dim(lena)
dims

plot(c(0, dims[1]), c(0, dims[2]), type = 'n', xlab = "", ylab = "")
rasterImage(lena, 0, 0, dims[1], dims[2])

library(jpeg)
lena <- readJPEG("data/lena.jpg")



# 47/82
lena.flip <- Image(flip(lena))
red.weight   <- .2989
green.weight <- .587
blue.weight  <- 0.114

lena.gray <- red.weight * imageData(lena.flip)[,,1] + 
  green.weight * imageData(lena.flip)[,,2] + 
  blue.weight  * imageData(lena.flip)[,,3]
dim(lena.gray)
lena.gray[1:5, 1:5]
image(lena.gray, col = grey(seq(0, 1, length = 256)))



# 48/82
lena.svd <- svd(lena.gray)
d <- diag(lena.svd$d)
dim(d)
u <- lena.svd$u
v <- lena.svd$v
plot(1:length(lena.svd$d), lena.svd$d, pch = 19, xlab = "i-th lena.svd$d", ylab = "lena.svd$d")

used.no <- 20
u.sub <- as.matrix(u[, 1:used.no])
v.sub <- as.matrix(v[, 1:used.no])
d.sub <- as.matrix(d[1:used.no, 1:used.no])
lena.approx <- u.sub %*% d.sub %*% t(v.sub)
image(lena.approx, col = grey(seq(0, 1, length = 256)))



# 52/82
x <- iris[, 1:4]
(covx <- cov(x))
e <- eigen(covx)
V <- e$vectors
V.inverse <- solve(e$vectors)
covx.hat <- V %*% diag(e$values) %*% V.inverse
covx.hat # same with covx



# 57/82
cell.matrix  <- read.table("data/YeastCellCycle_alpha.txt", header = TRUE, row.names = 1)
n <- dim(cell.matrix)[1]
p <- dim(cell.matrix)[2]-1
cell.data <- cell.matrix[,2:p+1]
gene.phase <- cell.matrix[,1]
phase <- unique(gene.phase)
phase.name <- c("G1", "S", "S/G2", "G2/M", "M/G1")
cell.sdata <- t(scale(t(cell.data))) 
rc <- rainbow(5)[as.integer(gene.phase)]
cc <- rainbow(ncol(cell.sdata))
hv <- heatmap(cell.sdata, col = GBRcol, scale = "column", Colv = NA, Rowv = NA,
              RowSideColors = rc, ColSideColors = cc, margins = c(5,10),
              xlab = "Times", ylab =  "Genes", main = "Heatmap of Microarray Data")



# 58/82
cell.pca <- princomp(cell.sdata, cor = TRUE, scores = TRUE)
# 2D plot for first two components
pca.dim1 <- cell.pca$scores[,1]
pca.dim2 <- cell.pca$scores[,2]
plot(pca.dim1, pca.dim2, main = "PCA for Cell Cycle Data on Genes", 
     xlab = "1st PCA Component", ylab = "2nd PCA Component",col = c(phase), pch = c(phase))
legend(3, 4, phase.name, pch = c(phase), col = c(phase))

# shows a screeplot.
plot(cell.pca) 
biplot(cell.pca)



# 59/82
# loadings plot
plot(loadings(cell.pca)[,1], loadings(cell.pca)[,2], xlab = "1st PCA", 
     ylab = "2nd PCA", main = "Loadings Plot", type = "n")
text(loadings(cell.pca)[,1], loadings(cell.pca)[,2], labels = paste(1:p))
abline(h = 0)
abline(v = 0)



# 62/82
cell.matrix  <- read.table("YeastCellCycle_alpha.txt", header = TRUE, row.names = 1)
n <- dim(cell.matrix)[1]
p <- dim(cell.matrix)[2]-1
cell.data <- cell.matrix[,2:p+1]
gene.phase <- cell.matrix[,1]
phase.name <- c("G1", "S", "S/G2", "G2/M", "M/G1")
cell.sdata <- t(scale(t(cell.data))) 
cell.cor <- cor(t(cell.sdata)) 
cell.dist <- sqrt(2*(1-cell.cor)) 
cell.mds <- cmdscale(cell.dist)
plot(cell.mds[,1], cell.mds[,2], type = "n", xlab = "MDS-1", ylab = "MDS-2", main = "MDS for Cell Cycle Data")
number <- c(1, 4, 5, 2, 3)[as.integer(gene.phase)]
phase.color <- c("green", "blue", "red", "gray", "yellow")
text(cell.mds[,1], cell.mds[,2], number, col =  phase.color[number])
legend(-0.7, 1.0, phase.name, pch = "01234", col = phase.color) 
 


# 73/82
swissroll <- function(n, sigma = 0.05){
  
  angle <- (3*pi/2)*(1+2*runif(n)); 
  height <- runif(n);
  xdata <-  cbind(angle*cos(angle), height, angle*sin(angle))
  
  # angle <-  seq((1.5*pi): (4.5*pi), length.out = n);
  # height <- sample(0:21, n, replace = TRUE);
  # xdata <-  cbind(angle*cos(angle), height, angle*sin(angle))
  
  xdata <- scale(xdata) + matrix(rnorm(n*3, 0, sigma), n, 3)
  
  order.angle <- order(angle)
  sort.angle <- sort(order.angle, index.return = TRUE)	
  col.id <- rainbow130(n)
  my.color <- col.id[sort.angle$ix]
  
  colnames(xdata) <- paste("x", 1:3, sep = "")
  
  list(xdata = xdata, angle = angle, color = my.color)
}


source("rainbow130.r")
sr <-  swissroll(800)
library(rgl); open3d()
plot3d(sr$xdata[,1], sr$xdata[,2], sr$xdata[,3], col = sr$color, size = 3,
       xlab = "", ylab = "", zlab = "", axes = T)	  
library(vegan)
sr.isomap <-  isomap(dist(sr$xdata), ndim = 2, k = 7) # try different k
plot(sr.isomap, col = sr$color)

 

# 78/82
library(coRanking)
LCMC


my.coranking <- function(X.high, X.low){
  # X.high <- iris[1:10,1:4]
  # X.low <- princomp(X.high)$score[,1:2]

  D.high <- as.matrix(dist(X.high)) 
  D.low <- as.matrix(dist(X.low))
  f <- function(x){
     rank(x, ties.method = 'first', na.last = FALSE)
  }
  diag(D.high) <- NA # NA is rank 1
  diag(D.low) <- NA
  R.high <- apply(D.high, 1, f) 
  R.low <- apply(D.low, 1, f) 
  table(R.high, R.low)[-1, -1]
}



# 80/82
library(coRanking) # install.packages("coRanking")
library(MASS)
par(mfrow = c(2, 3))
x <- iris[,1:4]
y <- iris[,5]
pca <- princomp(x)
Q.pca <- coranking(x, pca$score[,1:2], input = "data")
imageplot(Q.pca)
lcmc.pca <- LCMC(Q.pca, K = 5:10)

mds <- cmdscale(dist(x))
Q.mds <- coranking(x, mds[,1:2], input = "data")
imageplot(Q.mds)
lcmc.mds <- LCMC(Q.pca, K = 5:10)

mylda <- lda(x, grouping = y)
lda.dim <- as.matrix(x)%*%mylda$scaling[,1:2]
Q.lda <- coranking(x, lda.dim, input = "data")
imageplot(Q.lda)
lcmc.lda <- LCMC(Q.lda, K = 5:10)

names(lcmc.pca) <- paste0("K = ", 5:10)
rbind(lcmc.pca, lcmc.mds, lcmc.lda)

plot(pca$score[,1:2], col = as.integer(y)+1, main = "PCA")
plot(mds[,1:2], col = as.integer(y)+1, main = "MDS")
plot(lda.dim[,1:2], col = as.integer(y)+1, main = "LDA")

