#################################################
# C04: 關聯性分析 Association Rule Analysis     #
# 吳漢銘 國立政治大學統計學系                   #
# https://hmwu.idv.tw                           #
#################################################


# 8/57
# Number of distinct items under consideration.
no.item <- 5
# Add up the number of k-item subsets for k = 0, 1, ..., no.item.
sum(vapply(0:no.item, function(k) choose(no.item, k), numeric(1)))

# Closed form for the same count: every item is either in or out of a subset.
2 ^ no.item


# 22/57
library(arules)
# Raw data frame version of the data set.
data("AdultUCI")
head(AdultUCI, n = 6L)

# Ready-made transactions version of the same data.
data("Adult")
help("Adult") # see how to create transactions from AdultUCI
Adult

class(Adult)

help("transactions")



# 23/57
# Internal structure of the transactions object, then its first two
# transactions in readable form.
str(Adult)
inspect(Adult[seq_len(2)])



# 24/57
summary(Adult)



# 25/57
# Five example baskets: one character vector of purchased items per
# customer, named Customer1 ... Customer5.
a.list <- setNames(
  list(
    c("a", "b", "c"),
    c("a", "b"),
    c("a", "b", "d"),
    c("c", "e"),
    c("a", "b", "d", "e")
  ),
  paste0("Customer", seq_len(5))
)

a.list



# 26/57
# Coerce the named list of baskets into a transactions object.
alist.trans <- methods::as(a.list, Class = "transactions")
# Summarise the resulting transactions.
summary(object = alist.trans)

# Plot the transactions object as an image.
image(alist.trans)



# 27/57
# Binary incidence matrix: one row per customer, one column per item
# (1 = the customer bought the item). The values below are written one
# customer per line, so the matrix has to be filled row by row; with R's
# default column-wise fill the baskets would be scrambled.
a.matrix <- matrix(c(
  1, 1, 1, 0, 0,
  1, 1, 0, 0, 0,
  1, 1, 0, 1, 0,
  0, 0, 1, 0, 1
), ncol = 5, byrow = TRUE)
# Rows are labelled "Customer a" ... "Customer d", columns "Item1" ... "Item5".
dimnames(a.matrix) <- list(paste("Customer", letters[1:4]),
                           paste0("Item", 1:5))
a.matrix

# Coerce the incidence matrix into a transactions object.
amatirx.trans <- methods::as(a.matrix, Class = "transactions")
amatirx.trans

# List the individual transactions.
inspect(x = amatirx.trans)

# Summarise the transactions.
summary(object = amatirx.trans)



# 28/57
# Small data frame with two factor columns (both containing a missing
# value) and one logical column.
a.df <- data.frame(
  age = factor(c(6, 8, NA, 9, 16)),
  grade = factor(c("A", "C", "F", NA, "C")),
  pass = c(TRUE, TRUE, FALSE, TRUE, TRUE)
)
a.df

# Data frame -> transactions, then list the transactions.
adf.trans <- methods::as(a.df, Class = "transactions")
inspect(x = adf.trans)

# Transactions -> data frame.
methods::as(adf.trans, Class = "data.frame")

# creating transactions from (IDs, items) 
# Long format: one row per (transaction ID, item) pair.
a.df2 <- data.frame(
  TID = rep(c(1, 2, 3), times = c(2, 3, 1)),
  item = c("a", "b", "a", "b", "c", "b")
)
a.df2

# Group the items by transaction ID: one character vector per TID.
a.df2.s <- split(x = a.df2$item, f = a.df2$TID)
a.df2.s

# Coerce the list of item vectors into transactions and list them.
adf2.trans <- methods::as(a.df2.s, Class = "transactions")
inspect(x = adf2.trans)



# 29/57
# Reload a fresh copy of the raw data frame and summarise it.
data("AdultUCI")
summary(object = AdultUCI)
# Drop the fnlwgt and education-num columns.
AdultUCI$fnlwgt <- NULL
AdultUCI$`education-num` <- NULL



# 30/57
# map metric attributes
# Each numeric column is binned with cut() and the bins are relabelled as
# ordered categories.
AdultUCI$age <- ordered(
  cut(AdultUCI$age, breaks = c(15, 25, 45, 65, 100)),
  labels = c("Young", "Middle-aged", "Senior", "Old")
)
AdultUCI$`hours-per-week` <- ordered(
  cut(AdultUCI$`hours-per-week`, breaks = c(0, 25, 40, 60, 168)),
  labels = c("Part-time", "Full-time", "Over-time", "Workaholic")
)
# capital-gain and capital-loss share one scheme: values <= 0 are "None";
# positive values are split at the median of the positive values.
AdultUCI[c("capital-gain", "capital-loss")] <- lapply(
  AdultUCI[c("capital-gain", "capital-loss")],
  function(amount) {
    cuts <- c(-Inf, 0, median(amount[amount > 0]), Inf)
    ordered(cut(amount, cuts), labels = c("None", "Low", "High"))
  }
)

summary(AdultUCI[c("age", "hours-per-week", "capital-gain", "capital-loss")])

# create transactions
MyAdult <- methods::as(AdultUCI, Class = "transactions")
MyAdult



# 31/57
summary(object = MyAdult)
inspect(MyAdult[seq_len(2)])



# 32/57
library(arules)
# Load the Groceries transactions and open their help page.
data("Groceries")
help("Groceries")
# Internal structure, then the item information slot.
str(Groceries)
slot(Groceries, "itemInfo")



# 33/57
summary(object = Groceries)
# List the first four transactions.
inspect(Groceries[seq_len(4)])



# 34/57
# Mine rules using the package's default parameters.
rule0 <- apriori(data = Groceries)



# 35/57
# Mine again with explicit support and confidence thresholds.
rule1 <- apriori(
  data = Groceries,
  parameter = list(support = 0.005, confidence = 0.64)
)
inspect(x = rule1)



# 36/57
# Internal structure of the rules object and its quality measures.
str(rule1)
quality(rule1)



# 37/57
rule2 <- apriori(
  data = Groceries,
  parameter = list(support = 0.001, confidence = 0.5)
)
# Order the rules by support and show the first five.
rule2.sorted_sup <- sort(rule2, by = "support")
inspect(rule2.sorted_sup[seq_len(5)])



# 38/57
# Keep rules whose right-hand side partially matches "whole milk" and
# whose lift exceeds 1.3.
rule2.sub <- subset(
  rule2,
  subset = rhs %pin% "whole milk" & lift > 1.3
)
rule2.sub

# Display the top 3 support rules
inspect(head(rule2.sub, n = 3, by = "support"))

# Display the first 3 rules
inspect(rule2.sub[seq_len(3)])

# Get labels for the first 3 rules
labels(rule2.sub[seq_len(3)])

# Same labels with custom separators and without set delimiters.
labels(
  rule2.sub[seq_len(3)],
  itemSep = " + ",
  setStart = "",
  setEnd = "",
  ruleSep = " ---> "
)



# 39/57
# Order by confidence, then by lift, showing the first five rules each time.
rule2.sorted_con <- sort(rule2, by = "confidence")
inspect(rule2.sorted_con[seq_len(5)])

rule2.sorted_lift <- sort(rule2, by = "lift")
inspect(rule2.sorted_lift[seq_len(5)])



# 40/57
# Mine frequent itemsets (rather than rules) with apriori.
rule.freq_item <- apriori(
  data = Groceries,
  parameter = list(support = 0.001, target = "frequent itemsets"),
  control = list(sort = -1)
)
rule.freq_item
inspect(rule.freq_item[seq_len(5)])



# 41/57
# Frequent itemsets with eclat, restricted to itemsets of 1 to 3 items.
rule.fi_eclat <- eclat(
  data = Groceries,
  parameter = list(
    minlen = 1, maxlen = 3, support = 0.001,
    target = "frequent itemsets"
  ),
  control = list(sort = -1)
)
rule.fi_eclat
# Same call for itemsets of 3 to 5 items; this overwrites the previous result.
rule.fi_eclat <- eclat(
  data = Groceries,
  parameter = list(
    minlen = 3, maxlen = 5, support = 0.001,
    target = "frequent itemsets"
  ),
  control = list(sort = -1)
)
rule.fi_eclat
inspect(rule.fi_eclat[seq_len(5)])



# 42/57
# Plot the item frequencies of the 20 most frequent items.
itemFrequencyPlot(x = Groceries, topN = 20)


# 43/57
Titanic


# 44/57
str(Titanic)
# Flatten the contingency table: one row per combination of the four
# variables, plus a Freq column holding its count.
Titanic.df <- as.data.frame(Titanic)
Titanic.df
# Expand the counts into one row per person by repeating each combination
# Freq times (combinations with Freq == 0 disappear). The columns are
# built in one pass instead of growing a matrix with cbind() in a loop,
# and stringsAsFactors = TRUE makes them factors regardless of the R
# version, so summary() shows level counts and the columns can be
# converted to items.
Titanic.raw <- as.data.frame(
  lapply(Titanic.df[1:4], function(column) {
    rep(as.character(column), times = Titanic.df$Freq)
  }),
  stringsAsFactors = TRUE
)
dim(Titanic.raw)
str(Titanic.raw)
head(Titanic.raw)

summary(Titanic.raw)


# 45/57
# Structure, first rows and per-column summary of the expanded data.
str(object = Titanic.raw)

head(x = Titanic.raw)

summary(object = Titanic.raw)


# 46/57
library(arules)
# Mine association rules using the package's default parameters.
rules.all <- apriori(data = Titanic.raw)
# Round every quality measure to three decimals.
quality(rules.all) <- round(x = quality(rules.all), digits = 3)
rules.all
inspect(x = rules.all) # or use arules::inspect(rules.all)


# 47/57
inspect(x = rules.all) # or use > arules::inspect(rules.all)


# 48/57
# Mine rules whose right-hand side is a survival outcome; every other item
# may only appear on the left-hand side.
# Item labels built from a data frame have the form "variable=level" with
# no spaces around "=", and `appearance` must use exactly those labels.
rules <- apriori(Titanic.raw,
                 control = list(verbose = FALSE),
                 parameter = list(minlen = 2, support = 0.005,
                                  confidence = 0.8),
                 appearance = list(rhs = c("Survived=No", "Survived=Yes"),
                                   default = "lhs"))

# Round every quality measure to three decimals.
quality(rules) <- round(quality(rules), digits = 3)
# Rules are sorted by lift to make high-lift rules appear first
rules.sorted <- sort(rules, by = "lift")
inspect(rules.sorted)


# 50/57
# Pairwise subset relation between the sorted rules. The result is coerced
# to a dense matrix because is.subset() may return a sparse matrix
# (depending on the arules version), which does not accept the NA
# assignment below.
subset.matrix <- as.matrix(is.subset(rules.sorted, rules.sorted))
# Blank out the diagonal and lower triangle so that each rule is compared
# only with rules that come before it in the sorted order.
subset.matrix[lower.tri(subset.matrix, diag = TRUE)] <- NA
# A rule is flagged as redundant when at least one earlier rule is a subset
# of it.
redundant <- colSums(subset.matrix, na.rm = TRUE) >= 1
which(redundant)


#  51/57
# Keep only the rules that were not flagged.
rules.pruned <- rules.sorted[!redundant]
inspect(rules.pruned)


# 52/57
# Rules that predict survival from class and age only: the right-hand side
# is fixed to "Survived=Yes", the left-hand side may use only the listed
# Class/Age items, and no other item may appear (default = "none").
# Item labels have the form "variable=level" with no spaces around "=".
rules <- apriori(Titanic.raw,
                 parameter = list(minlen = 3, support = 0.002,
                                  confidence = 0.2),
                 appearance = list(rhs = c("Survived=Yes"),
                                   lhs = c("Class=1st", "Class=2nd",
                                           "Class=3rd",
                                           "Age=Child", "Age=Adult"),
                                   default = "none"),
                 control = list(verbose = FALSE))
# Show the rules ordered by confidence.
rules.sorted <- sort(rules, by = "confidence")
inspect(rules.sorted)


# 53/57
# The plot methods for rules used below (including method = "grouped",
# "graph" and "paracoord") are provided by arulesViz, which has to be
# attached before plot() is called on a rules object.
library(arulesViz)
plot(rules.all)
plot(rules.all, cex = 2)
# Label each point with the row name of its rule, placed at the rule's
# support/confidence values.
# NOTE(review): text() draws on a base-graphics device; confirm that the
# installed arulesViz version renders this plot with base graphics.
x <- quality(rules.all)$support
y <- quality(rules.all)$confidence
text(x, y, rownames(quality(rules.all)))


# 54/57
plot(rules.all, method = "grouped")


# 55/57
plot(rules.all, method = "graph")
plot(rules.all, method = "graph",
     control = list(type = "items"))


# 56/57
plot(rules.all, method = "paracoord", control = list(reorder = TRUE))