#################################################
# C03: 分類法則 Classification Rules            #
# 吳漢銘 國立政治大學統計學系                   #
# https://hmwu.idv.tw                           #
#################################################


# 6/75
# 6/75
# Split a data frame into a training set and a testing set.
#
# Args:
#   df:          data frame to split
#   train.ratio: proportion of rows assigned to the training set (in [0, 1])
#   seed:        optional RNG seed for a reproducible split
#
# Returns:
#   A list with elements `trainset` and `testset` (row order preserved).
splitdf <- function(df, train.ratio, seed = NULL) {
  if (!is.null(seed)) set.seed(seed)
  # seq_len() is safe for 0-row data frames (1:nrow(df) would give c(1, 0))
  index <- seq_len(nrow(df))
  id <- sample(index, trunc(length(index) * train.ratio))
  train <- df[id, ]
  # setdiff() handles an empty id correctly: df[-integer(0), ] would
  # drop ALL rows instead of keeping them
  test <- df[setdiff(index, id), ]
  list(trainset = train, testset = test)
}

# Split iris 90/10 with a fixed seed so the partition is reproducible
splits <- splitdf(iris, 0.9, 12345)
lapply(splits, dim)   # dimensions of each partition
iris.training <- splits$trainset
iris.testing <- splits$testset

# dplyr approach: sample 90% of the rows for training.
# BUG FIX: dplyr verbs silently drop data.frame row names, so the original
# trick `as.numeric(rownames(sample_frac(iris, 0.9)))` returns 1..n (not the
# sampled row ids) and the "complement" was wrong.  Tag each row with its
# position first, then take the complement explicitly.
library(dplyr)
iris.tagged <- mutate(iris, .row = row_number())
iris.train <- sample_frac(iris.tagged, 0.9)
id <- iris.train$.row                 # actual positions of the sampled rows
iris.train <- select(iris.train, -.row)
iris.test <- iris[-id, ]



# 7/75
# caTools::sample.split performs a STRATIFIED split: the class proportions
# found in Y are preserved in both partitions.
library(caTools)
Y <- iris[,5] # extract labels from the data
msk <- sample.split(Y, SplitRatio = 4/5)
msk                  # logical mask: TRUE = row goes to the training set
table(Y, msk)        # verify stratification per class
iris.train <- iris[msk, ] 
iris.test <- iris[!msk, ]  
dim(iris.train) 
dim(iris.test)

# Non-stratified variant: split on row indices instead of class labels.
# library() (not require()) so a missing package errors immediately.
library(caTools)
set.seed(12345) 
id <- sample.split(seq_len(nrow(iris)), SplitRatio = 0.90)
# id is already a logical vector -- index with it directly instead of
# the redundant subset(iris, id == TRUE)
iris.train <- iris[id, ]
iris.test <- iris[!id, ]


# caret::createDataPartition also stratifies on the class labels;
# list = FALSE returns a matrix of row indices
library(caret)
id <- createDataPartition(y = iris$Species, p = 0.9, list = FALSE)
iris.train <- iris[id, ]
iris.test <- iris[-id, ]



# 12/75
# ROCR workflow step 1: build a prediction object from the continuous
# scores and the true binary labels
library(ROCR)
data(ROCR.simple)
ROCR.simple
pred <- prediction(ROCR.simple$predictions, ROCR.simple$labels)
pred
str(pred)


# 14/75
# Step 2: performance() derives an evaluation curve from the prediction
# object; tpr vs. fpr is the ROC curve
perf <- performance(pred, measure = "tpr", x.measure = "fpr") # predictor evaluation
perf
str(perf)
plot(perf, colorize = TRUE)   # color encodes the score cutoff



# 15/75
# ROC curve 
roc <- performance(pred, "tpr", "fpr")
roc
plot(roc)

# same curve, colorized by cutoff, with equal axis scaling
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, colorize = TRUE, lwd = 3, asp = 1, main = "ROC curve")

# precision/recall curve (x-axis: recall, y-axis: precision)
prec.rec <- performance(pred, "prec", "rec")
plot(prec.rec, lwd = 2, main = "Precision/Recall curve")

# sensitivity/specificity curve (x-axis: specificity, y-axis: sensitivity)
ss <- performance(pred, "sens", "spec")
plot(ss, lwd = 3, main = "Sensitivity/Specificity curve")

# Lift chart (x-axis: rpp = rate of positive predictions, y-axis: lift)
lift <- performance(pred, "lift", "rpp")
plot(lift, lwd = 3, main = "Lift chart")



# 16/75
demo(ROCR)   # package demo (interactive)
# accuracy at every score cutoff
acc <- performance(pred, measure = "acc")
str(acc)
# area under the ROC curve -- a single scalar stored in y.values
auc <- performance(pred, measure = "auc")
str(auc)
auc@y.values[[1]]



# 17/75
# mlbench: Machine Learning Benchmark Problems
#install.packages("mlbench")
library(mlbench)
data(BreastCancer) # Wisconsin Breast Cancer Database
dim(BreastCancer)  # 699 x 11
levels(BreastCancer$Class) # "benign", "malignant"
head(BreastCancer)
? BreastCancer
summary(BreastCancer)


# Keep 5 predictors as numeric scores; drop Id (col 1) and columns 7:11
x <- as.data.frame(lapply(BreastCancer[, -c(1, 7:11)], as.numeric))
dim(x) # 699   5
y <- BreastCancer$Class 
n <- nrow(x)
p <- ncol(x)
# 80/20 train/test split; seq_len() is the safe index sequence and
# floor() makes the (previously fractional) sample size an explicit integer
id <- sample(seq_len(n), floor(n * 0.8))
length(id)
x.train <- x[id, ]
x.test <- x[-id, ]
y.train <- y[id]
y.test <- y[-id]

# Fit an SVM classifier with default settings (RBF kernel by default)
library(e1071)
model <- svm(x.train, y.train) 
summary(model)
# predict on the held-out data and cross-tabulate against the truth
pred.1 <- predict(model, x.test)
table(y.test, pred.1)

# caret's confusionMatrix adds accuracy, kappa, sensitivity, etc.
library(caret)
confusionMatrix(pred.1, y.test)


# compute decision values and probabilities:
pred.2 <- predict(model, x.test, decision.values = TRUE)
str(pred.2)
# the raw decision values ride along as an attribute of the prediction
pred.values <- attr(pred.2, "decision.values")
head(pred.values)

# negate the scores so that larger values correspond to "malignant"
pred.svm <- prediction(-pred.values, y.test) # Levels: benign < malignant
perf.svm <- performance(pred.svm, "tpr", "fpr")
plot(perf.svm, lwd = 3, main = "ROC Curve")
abline(a = 0, b = 1, col = "grey")   # random-classifier reference line
legend(0.6, 0.4, "svm", lty = 1, lwd = 3)
auc <- performance(pred.svm, measure = "auc")
auc@y.values[[1]]



# 33/75
#install.packages("TH.data")
library(TH.data)
# bodyfat: DEXfat (body fat) plus anthropometric covariates -- see ?bodyfat
data("bodyfat", package = "TH.data")
? bodyfat
dim(bodyfat)
head(bodyfat)




# 34/75
set.seed(1234)
# randomly assign each row to group 1 (train, ~70%) or group 2 (test, ~30%)
id <- sample(2, nrow(bodyfat), replace = TRUE, prob = c(0.7, 0.3))
bodyfat.train <- bodyfat[id == 1,]
bodyfat.test <- bodyfat[id == 2,]
# train a decision tree (regression tree: DEXfat is continuous)
library(rpart)
my.model <- DEXfat ~ age + waistcirc + hipcirc + elbowbreadth + kneebreadth
# minsplit = 10: a node must contain at least 10 obs. before a split is attempted
bodyfat.rpart <- rpart(my.model, data = bodyfat.train, control = rpart.control(minsplit = 10))
bodyfat.rpart 



# 35/75
# inspect the structure of the fitted rpart object
attributes(bodyfat.rpart)
str(bodyfat.rpart)



# 36/75
# Complexity-parameter table: CP, tree size, and cross-validated error
print(bodyfat.rpart$cptable)

plot(bodyfat.rpart)

# use.n = TRUE annotates each leaf with the number of observations
# (spell out TRUE -- T is a reassignable variable, not a keyword)
text(bodyfat.rpart, use.n = TRUE)



# 37/75
# Select the subtree with the minimum cross-validated prediction error
opt <- which.min(bodyfat.rpart$cptable[, "xerror"])
opt
cp <- bodyfat.rpart$cptable[opt, "CP"]
cp
# prune back to the chosen complexity parameter
bodyfat.prune <- prune(bodyfat.rpart, cp = cp)
bodyfat.prune

plot(bodyfat.prune)
# use.n = TRUE (not T) shows per-leaf observation counts
text(bodyfat.prune, use.n = TRUE, cex = 0.7)



# 38/75
# predicted vs. observed DEXfat on the test set
DEXfat.pred <- predict(bodyfat.prune, newdata = bodyfat.test)
xlim <- range(bodyfat$DEXfat)
plot(DEXfat.pred ~ DEXfat, data = bodyfat.test, xlab = "Observed", ylab = "Predicted", ylim = xlim, xlim = xlim)
abline(a = 0, b = 1)   # perfect-prediction reference line



#  40/75
library(randomForest)
id <- sample(2, nrow(iris), replace = TRUE, prob = c(0.9, 0.1))
iris.train <- iris[id == 1, ]
iris.test <- iris[id == 2, ]
iris.rf <- randomForest(Species ~ ., data = iris.train, ntree = 100, proximity = TRUE)
iris.rf

attributes(iris.rf)


# 41/75
# predict() with no newdata returns the out-of-bag (OOB) predictions
table(predict(iris.rf), iris.train$Species)

head(iris.rf$err.rate)   # OOB + per-class error rates by number of trees

plot(iris.rf, lwd = 2)   # error-rate curves vs. number of trees
legend("topright", colnames(iris.rf$err.rate), 
       lty = 1:4, lwd = 2, col = 1:4)

#The importance of variables can be obtained 
# with functions importance() and varImpPlot().
importance(iris.rf)

varImpPlot(iris.rf)



# 42/75
iris.pred <- predict(iris.rf, newdata = iris.test)
table(iris.pred, iris.test$Species)
# margin = proportion of votes for the true class minus the maximum for
# the other classes; negative margin = misclassified case.
# NOTE(review): margin() computes from the votes stored in iris.rf
# (training rows), yet is passed iris.test$Species here -- the lengths
# likely differ; verify this is intended.
sort(margin(iris.rf, iris.test$Species))
plot(margin(iris.rf, iris.test$Species))
legend("bottomright", levels(iris.test$Species), col = c("red","blue","green"), pch = 15)



# 57/75
library(e1071)
attach(iris)
x <- subset(iris, select = -Species)
y <- Species

# use default setting (?svm)
model <- svm(x, y) 
print(model)
summary(model)

# test with train data and report accuracy
pred <- predict(model, x)
table(pred, y)

sum(diag(table(pred, y)))/length(y)



# 68/75
# install.packages("xgboost")
# install.packages("Ckmeans.1d.dp")
# install.packages("DiagrammeR")
library(xgboost)
library(Ckmeans.1d.dp)
library(DiagrammeR)

# training set and testing set (~80/20 random assignment)
id <- sample(2, nrow(iris), replace = TRUE, prob = c(0.8, 0.2))
iris.train <- iris[id == 1, ]
iris.test <- iris[id == 2, ]

# xgboost requires numeric matrices and 0-based integer class labels
iris.train.x <- as.matrix(iris.train[, -5])
iris.train.y <- as.integer(iris.train[, 5]) - 1  # 0, 1, ...
iris.test.x <- as.matrix(iris.test[, -5])
iris.test.y <- as.integer(iris.test[, 5]) - 1

dim(iris.train.x)
dim(iris.test.x)

# convert matrix + labels to DMatrix, xgboost's internal data format
iris.DM.train <- xgb.DMatrix(data = iris.train.x, label = iris.train.y)
iris.DM.test <- xgb.DMatrix(data = iris.test.x, label = iris.test.y)
str(iris.DM.train)
no.class <- length(unique(iris.train.y))



# 69/75
# watchlist: datasets whose evaluation metric is reported each round
watchlist <- list(train = iris.DM.train, eval = iris.DM.test)
# multi:softprob returns per-class probabilities (softmax would return labels)
param <- list(booster = "gbtree", 
              objective = "multi:softprob", 
              eval_metric = "mlogloss",
              num_class = no.class)


?xgb.train # could custom objective and evaluation metric

## build the model using "xgb.train"
xgb.result <- xgb.train(param, iris.DM.train, nrounds = 5, watchlist)
str(xgb.result)

# or build the model using "xgboost"
xgb.model <- xgboost(param = param, data = iris.DM.train, nrounds = 20)



# 70/75
# get the prediction for testing data (flat vector of class probabilities)
Ypred <- predict(xgb.model, iris.DM.test)
# reshape into an (n x no.class) probability matrix
Ypred <- t(matrix(Ypred, no.class, length(Ypred)/no.class))
# predicted class = column with the highest probability in each row
at <- apply(Ypred, 1, which.max)
Ypred <- factor(levels(iris$Species)[at], levels = levels(iris$Species))

# confusion matrix
cm <- table(Ytest = iris.test[,5], Ypred)
cm

# accuracy
sum(diag(cm))/sum(cm)

# variable importance
# BUG FIX: iris.train.x is a matrix, so names() returns NULL;
# colnames() gives the actual feature names.
imp <- xgb.importance(colnames(iris.train.x), model = xgb.model)
print(imp)
xgb.plot.importance(imp)
xgb.ggplot.importance(imp)
xgb.plot.tree(feature_names = colnames(iris.train.x), model = xgb.model, 
              trees = 0:2, show_node_id = TRUE)




# 73/75
#install.packages("caret")
library(caret)
?train
names(getModelInfo())   # every model identifier caret::train supports



# 74/75
library(AppliedPredictiveModeling)
library(caret)
library(pROC)

# scatterplot matrix with per-class confidence ellipses
featurePlot(x = iris[, 1:4], y = iris$Species, 
            plot = "ellipse",
            auto.key = list(columns = 3))


# stratified 80/20 split on the class labels
id <- createDataPartition(iris$Species, p = 0.8, list = FALSE, times = 1)
iris.train <- iris[id, ]
iris.test  <- iris[-id, ]
dim(iris.train)
dim(iris.test)

# shared preprocessing and metric for all models in this section
my.preProcess <- c("center", "scale")
performance.metric <- "Accuracy"

set.seed(12345)
# 10-fold cross-validation resampling scheme
cv.10fold <- trainControl(method = "cv", number = 10)

rpart.model <- train(Species ~ ., data = iris.train, method = "rpart", 
                     metric = performance.metric,
                     trControl = cv.10fold, preProcess = my.preProcess)
rpart.model
iris.rpart.pred <- predict(rpart.model, iris.test)
confusionMatrix(iris.rpart.pred, iris.test$Species)
plot(varImp(rpart.model))



# 75/75
# fit two more models with the same resampling scheme for a fair comparison
rf.model <- train(Species ~ ., data = iris.train, method = "rf",
                  metric = performance.metric,
                  trControl = cv.10fold, preProcess = my.preProcess)

svm.model <- train(Species~., data = iris.train, method = "svmRadial", 
                   metric = performance.metric,
                   trControl = cv.10fold, preProcess = my.preProcess)

# collect the cross-validation results and compare the three models
results.iris <- resamples(list(rpart = rpart.model, rf = rf.model, svm = svm.model))
summary(results.iris)
dotplot(results.iris)



# Parameter Tuning: Grid Search: parameter mtry
# mtry: Number of variables randomly sampled as candidates at each split of the tree
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3, search = "grid")
tunegrid <- expand.grid(mtry = c(1:15))
rf.gridsearch <- train(Species~., data = iris.train, method = "rf", 
                       metric = performance.metric, tuneGrid = tunegrid, 
                       trControl = control, preProcess = c("center", "scale"))
print(rf.gridsearch)
plot(rf.gridsearch)



