#################################################
# B01-2: 遺失值處理/離群值處理/資料轉換         #
# 吳漢銘 國立政治大學統計學系                   #
# https://hmwu.idv.tw                           #
#################################################

# 10/61
x <- c(1, 0, 10)
x / x
is.nan(x / x)

1 / x
is.finite(1 / x)
- 10 / x
is.infinite(-10 / x)

exp(-Inf)
0 / Inf
Inf - Inf
Inf / Inf


# 13/61
head(airquality)
dim(airquality)
mydata[4:10, 3] <- rep(NA, 7)
mydata[1:5, 4] <- NA
summary(mydata)


# 14/61
library(mice)
md.pattern(mydata)

library(VIM)
mydata.aggrplot <- aggr(mydata, col = c('lightblue', 'red'), numbers = TRUE, 
                        prop = TRUE, sortVars = TRUE, labels = names(mydata), 
                        cex.axis = 0.7, gap = 3)


# 15/61
matrixplot(mydata)


# 16/61
md.pairs(mydata)


# 17/61
marginplot(mydata[, c("Ozone", "Solar.R")], col = c("blue", "red"))


# 18/61
mdata <- matrix(rnorm(15), nrow = 5)
mdata[sample(1:15, 4)] <- NA
mdata <- as.data.frame(mdata)
mdata
(x1 <- na.omit(mdata))
(x2 <- mdata[complete.cases(mdata),])
mdata[!complete.cases(mdata),]


# 19/61
mdata
cov(mdata)
cov(mdata, use = "all.obs")
cov(mdata, use = "complete.obs")

cov(mdata, use = "na.or.complete")
cov(mdata, use = "pairwise")


# 20/61
mean.subst <- function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  x
}

mdata
mdata.mip <- apply(mdata, 2, mean.subst)
mdata.mip


# 22/61
names(airquality)
airquality.imp.median <- kNN(airquality[1:4], k = 5)
head(airquality.imp.median)


# 23/61
matrixplot(airquality[1:4], interactive = F, main = "airquality")
matrixplot(airquality.imp.median[1:4],
           interactive = F,
           main = "imputed by median")

trim_mean <- function(x) {
  mean(x, trim = 0.1)
}

airquality.imp.tmean <-
  kNN(airquality[1:4], k = 5, numFun = trim_mean)


# 25/61
airquality.imp.lm <-
  regressionImp(Ozone ~ Wind + Temp, data = airquality)

data(sleep)
summary(sleep)

sleep.imp.lm <-
  regressionImp(Dream + NonD ~ BodyWgt + BrainWgt, data = sleep)
summary(sleep.imp.lm)



# 28/61
library(mice)
methods(mice)
? mice

# 29/61
# Generate 10% missing values at Random
iris.mis <- prodNA(iris, noNA = 0.1)  # library(missForest)
# Check missing values introduced in the data
summary(iris.mis)
iris.mis <- subset(iris.mis, select = -c(Species))
summary(iris.mis)

library(mice)
md.pattern(iris.mis)

library(VIM)
mice_plot <-
  aggr(
    iris.mis,
    col = c('navyblue', 'yellow'),
    numbers = TRUE,
    sortVars = TRUE,
    labels = names(iris.mis),
    cex.axis = 0.7,
    gap = 3,
    ylab = c("Missing data", "Pattern")
  )

#  Imputation
imputed.Data <- mice(iris.mis, m = 5, maxit = 50, method = 'pmm', seed = 500)

summary(imputed.Data)
# Check imputed values
imputed.Data$imp$Sepal.Width
# Get complete data (2nd out of 5)
completeData <- complete(imputed.Data, 2)
# Build predictive model
fit <-
  with(data = imputed.Data,
       exp = lm(Sepal.Width ~ Sepal.Length + Petal.Width))
# Combine results of all 5 models
combine <- pool(fit)
summary(combine)



# 32/61
library(outliers)
dim(mpg)
head(mpg, 3)
summary(mpg$hwy)
par(mfrow = c(2, 1))
plot(mpg$hwy, main = "index plot")
hist(mpg$hwy)

# 34/61
test <- grubbs.test(mpg$hwy)
test

test <- grubbs.test(mpg$hwy, opposite = TRUE)
test

dixon.test(mpg$hwy)
dixon.test(mpg$hwy, opposite = TRUE)


# 36/61
data(stackloss)
dim(stackloss)
head(stackloss, 4)
summary(stackloss)

# 37/61
library(MASS)
cov(stackloss)
cov.mve(stackloss)$cov

par(mfrow = c(2, 1))
library(MASS)
ccov <- cov(stackloss)
pca.scores <- as.matrix(stackloss) %*% eigen(ccov)$vectors[, 1:2]
plot(pca.scores, main = "PCA", asp = 1, type = "n")
text(pca.scores, label = 1:nrow(stackloss))
rcov <- cov.mve(stackloss)$cov
rpca.scores <- as.matrix(stackloss) %*% eigen(rcov)$vectors[, 1:2]
plot(rpca.scores, main = "Robust PCA", asp = 1, type = "n")
text(rpca.scores, label = 1:nrow(stackloss))


# 41/61
par(mfrow = c(1, 4))
raw.data <- 0:100
pa.data <- ifelse(raw.data >=  84, 1, 0)
id <- which(pa.data == 1)
plot(raw.data[id], pa.data[id], main = "present-absent", 
     type = "l", lwd = 2, col = "blue", ylim = c(-1, 2), xlim = c(0, 100))
points(raw.data[-id], pa.data[-id], type = "l", lwd = 2, col = "blue")
log.data <- log(raw.data)
plot(raw.data, log.data, main = "log", type = "l", lwd = 2, col = "blue")
sqrt10.data <- sqrt(raw.data)*10
plot(raw.data, sqrt10.data, main = "sqrt*10", type = "l", lwd = 2, col = "blue", asp = 1)
abline(a = 0, b = 1)
trun.data <- ifelse(raw.data >=  80, 80, ifelse(raw.data < 20, 20, raw.data))
plot(raw.data, trun.data, main = "truncation", type = "l", lwd = 2, col = "blue")


# 42/61
par(mfrow = c(1, 4))
raw.data <- 0:100
trans.data <- raw.data/max(raw.data)
plot(raw.data, trans.data, main = "/max", type = "l", lwd = 2, col = "blue")

trans.data <- raw.data/sum(raw.data)  #Species profile transformation
plot(raw.data, trans.data, main = "/sum", type = "l", lwd = 2, col = "blue")

trans.data <- raw.data/sqrt(sum(raw.data^2)) #length of 1, Chord transformation
plot(raw.data, trans.data, main = "norm (Chord)", type = "l", lwd = 2, col = "blue")

trans.data <- sqrt(raw.data/sum(raw.data)) #Hellinger transformation
plot(raw.data, trans.data, main = "Hellinger", type = "l", lwd = 2, col = "blue")



# 43/61
par(mfrow = c(1, 3)); set.seed(12345)
raw.data <- c(sample(0:60, 100, replace = T), sample(90:100, 5, replace = T))
rank.data <- rank(raw.data)
hist(raw.data, main = "raw")
hist(rank.data, main = "rank")
plot(raw.data, rank.data, main = "rank", lwd = 2, col = "blue")

# 45/61
x <- rnorm(50)
mycolor <- rainbow(150)[1:100]
z <- (x-min(x))/(max(x)-min(x))
plot(x, rep(1, length(x)), main = "range (0, 1)", type = "n", ylab = "", ylim = c(0.3, 1))
points(c(seq(min(x), max(x), length.out = 100)), rep(1, 100), col = mycolor, cex = 2, pch = 15)
text(0, 0.95, "color spectrum")
points(x, rep(0.8, length(x)), col = mycolor, cex = 2, pch = 15)
text(0, 0.75, "x, col = mycolor")
points(sort(x), rep(0.6, length(x)), col = mycolor, cex = 2, pch = 15)
text(0, 0.55, "sort(x), col = mycolor")
points(x, rep(0.4, length(x)), col = mycolor[floor(z*99)+1], cex = 2, pch = 15)
text(0, 0.35, "x, col = mycolor[floor(z*99)+1]")


# 46/61
library('R.matlab')
data <- readMat("software.mat")
print(data)
str(data)


# 47/61
plot(data$prepsloc, data$defsloc, xlab = "PrepTime(min)/SLOC", ylab = "Defects/SLOC", main = "Software Data")

plot(log(data$prepsloc), log(data$defsloc), xlab = "Log PrepTime/SLOC", 
     ylab = "Log Defects/SLOC", main = "Software Data")

plot(log(data$prepsloc), log(data$defsloc), xlab = "Log PrepTime/SLOC", 
     ylab = "Log Defects/SLOC", main = "Software Data", asp = 1)


# 48/61
logx <- function(x){
  log(x + 1 - min(x)) 
}

x <- runif(80, min = -5, max = 5) 
# x <- rnorm(80) 
x <- c(x, rnorm(20, mean = 20, sd = 10))
par(mfrow = c(1, 3))
hist(x, main = "x~runif")
plot(x, logx(x), main = "x vs logx")
hist(logx(x), main = "logx")


# 49/61
x <- seq(0.5, 2, length.out = 100)
bc <- function(y, lambda){
  (y^lambda -1)/lambda
} 
lambda <- seq(-2, 3, 0.5)
plot(0, 0, type = "n", xlim = c(0.5, 2), ylim = c(-2, 2.5), main = "Box-Cox transformation")
for(i in 1:length(lambda)){
  points(x, bc(x, lambda[i]), type = "l", col = i)
  points(2, bc(2, lambda[i]), col = i, pch = i)
}
legend(0.7, 2.5, legend = as.character(rev(lambda)), 
       lty = 1, pch = length(lambda):1, col = length(lambda):1)


# 50/61
x <- rexp(1000)
bc <- function(y, lambda) {
  (y ^ lambda - 1) / lambda
}
qqnorm(x)
qqline(x, col = "red")

bc1.x <- bc(x, 0.1)
qqnorm(bc1.x, main = "lambda = 0.1")
qqline(bc1.x, col = "red")
bc3.x <- bc(x, 0.5)
qqnorm(bc3.x, main = "lambda = 0.5")
qqline(bc3.x, col = "red")

bc2.x <- bc(x, 0.268)
qqnorm(bc2.x, main = "lambda = 0.268")
qqline(bc2.x, col = "red")

hist(x, main = "rexp(1000)")
hist(bc2.x, main = "lambda = 0.268")


# 53/61
x <- rpois(500, lambda = 1)
hist(x, main = "rpois(500, lambda = 1)")
z <- scale(x)
hist(z, main = "")


# 54/61
head(airquality)
r <- range(airquality[, 1:4], na.rm = T)
hist(airquality$Ozone , xlim = r)
hist(airquality$Solar.R, xlim = r)
hist(airquality$Wind, xlim = r)
hist(airquality$Temp, xlim = r)
airquality.std <- as.data.frame(apply(airquality, 2, scale))
r.std <- c(-3, 3)
hist(airquality.std$Ozone, xlim = r.std)
hist(airquality.std$Solar.R, xlim = r.std)
hist(airquality.std$Wind, xlim = r.std)
hist(airquality.std$Temp, xlim = r.std)


# 55/61
library(MASS)
data(crabs)


# 57/61
pairs(crabs[, 4:8],
      pch = as.integer(crabs$sex) + 1,
      col = c("blue", "orange")[as.integer(crabs$sp)])


# 58/61
par(mfrow = c(1, 2))
mp <- as.integer(crabs$sex)+1
mc <- c("blue", "orange")[as.integer(crabs$sp)]
isometric.size <- apply(crabs[, 4:8], 1, mean)
plot(isometric.size,  log(crabs$BD/crabs$RW), pch = mp, col = mc)
plot(isometric.size, log(crabs$CL/crabs$CW), pch = mp, col = mc)