#################################################
# B05: 統計模型與迴歸分析                       #
# 吳漢銘 國立政治大學統計學系                   #
# https://hmwu.idv.tw                           #
#################################################

# 8/71
# Basic lm() formula notation on simulated data.
y <- rnorm(50)
x1 <- rnorm(50)
x2 <- rnorm(50)
x3 <- rnorm(50)
lm(y ~ x1 + x2)  # two main effects
lm(y ~ x1 - 1)   # drop the intercept (regression through the origin)
lm(y ~ x1 * x2)  # main effects plus the interaction: x1 + x2 + x1:x2

y <- rnorm(50)
# Use TRUE rather than the reassignable shorthand T.
school <- as.factor(sample(c("a", "b", "c"), 50, replace = TRUE))
gender <- as.factor(sample(c("f", "m"), 50, replace = TRUE))
table(school, gender)
lm(y ~ school / gender)  # gender nested within school
lm(y ~ gender / school)  # school nested within gender


# 9/71
# `|` is not a special formula operator for base lm() (it is conditioning
# in lattice/lme4); here x1 | x2 is evaluated as the elementwise logical
# OR and enters the model as a single derived variable.
lm(y ~ x1 | x2)

lm(y ~ x1:x2:x3)  # only the three-way interaction term

# Build a formula programmatically: y ~ x1 + x2 + ... + x25.
# paste0() is the idiomatic shorthand for paste(..., sep = "").
xnam <- paste0("x", 1:25)
(fmla <- as.formula(paste("y ~ ", paste(xnam, collapse = "+"))))


# 10/71
# Formula operators on three predictors: crossing (*), nesting (/),
# and the power operator (^) for interaction depth.
y <- rnorm(50)
A <- rnorm(50)
B <- rnorm(50)
C <- rnorm(50)

lm(y ~ A * B * C)  # all main effects plus every interaction up to A:B:C
lm(y ~ A / B / C)  # nesting: expands to A + A:B + A:B:C

lm(y ~ (A + B + C) ^ 3)  # same term expansion as A * B * C
lm(y ~ (A + B + C) ^ 2)  # main effects and two-way interactions only


# 15/71
# The built-in airquality data set: daily air-quality measurements.
dim(airquality)
head(airquality)

# Pull out the two variables used throughout the regression examples.
wind <- airquality[["Wind"]]
temp <- airquality[["Temp"]]
plot(x = temp, y = wind, main = "scatterplot of wind vs temp")


# 16/71
# Least-squares estimates for wind ~ temp computed by hand.
y <- airquality[["Wind"]]
x <- airquality[["Temp"]]
xbar <- mean(x)
print(xbar)
ybar <- mean(y)
print(ybar)
# Slope: beta1 = sum((x - xbar)(y - ybar)) / sum((x - xbar)^2)
beta1.num <- sum((x - xbar) * (y - ybar))
beta1.den <- sum((x - xbar)^2)
beta1.hat <- beta1.num / beta1.den
print(beta1.hat)
# Intercept: beta0 = ybar - beta1 * xbar
beta0.hat <- ybar - beta1.hat * xbar
print(beta0.hat)
# Fitted values at the observed x.
yhat <- beta0.hat + beta1.hat * x

# Sum-of-squares shortcuts; sum(y * (x - xbar)) equals
# sum((y - ybar) * (x - xbar)) because sum(x - xbar) = 0.
Sxy <- sum(y * (x - xbar))
print(Sxy)
Sxx <- sum((x - xbar)^2)
print(Sxx)
Syy <- sum((y - ybar)^2)
print(Syy)
# The same slope again via Sxy / Sxx.
beta1.hat2 <- Sxy / Sxx
print(beta1.hat2)


# 17/71
# lsfit(): the pre-lm least-squares interface (takes x and y directly).
model_fit <- lsfit(temp, wind)
ls.print(model_fit)  # coefficient table plus overall F test

plot(temp, wind, main = "temp vs wind", pch = 20)
abline(model_fit, col = "red")  # abline() accepts an lsfit result
text(80, 19, "Regression line:")
text(80, 18, "y = 23.2337 - 0.1705 x")  # fitted equation, rounded to 4 dp


# 22/71
# The same regression via lm(); compare with the hand computation above.
my_model <- lm(wind ~ temp)
my_model
summary(my_model)

plot(wind ~ temp, main = "airquality")  # formula interface to plot()
abline(my_model, col = "red")  # add the fitted regression line
text(80, 19, "Regression line:")
text(80, 18, "y = 23.2337 - 0.1705 x")


# 23/71
# ANOVA table for the fitted regression.
my_aov <- aov(my_model)
summary(my_aov)

# The same decomposition by hand: SST = SSR + SSE.
n <- length(wind)
e <- y - yhat  # residuals (y and yhat from the hand computation above)
SSE <- sum(e ^ 2)  # error (residual) sum of squares
SSE
MSE <- SSE / (n - 2)  # mean squared error: n - 2 df in simple regression
MSE
SSR <- beta1.hat * Sxy  # regression sum of squares
SSR
MSR <- SSR / 1  # one regression degree of freedom
MSR
SST <- SSR + SSE  # total sum of squares
SST
Syy  # SST should reproduce Syy
F0 <- MSR / MSE  # F statistic for H0: beta1 = 0
F0


# 25/71
# 95% confidence intervals for the regression coefficients.
alpha <- 0.05
# SE of the intercept: sqrt(MSE * (1/n + xbar^2 / Sxx)).
se.beta0 <- sqrt(MSE * (1 / n + xbar ^ 2 / Sxx))
se.beta0
# BUG FIX: the residual degrees of freedom in simple linear regression
# are n - 2 (two estimated parameters), not n - 1 — this matches the
# built-in confint() result below. qt(alpha/2, ...) is negative, so
# c(tstar, -tstar) yields (lower, upper).
tstar <- qt(alpha / 2, n - 2) * se.beta0
CI.beta0 <- beta0.hat + c(tstar, - tstar)
CI.beta0

# SE of the slope: sqrt(MSE / Sxx).
se.beta1 <- sqrt(MSE / Sxx)
se.beta1
tstar <- qt(alpha / 2, n - 2) * se.beta1
CI.beta1 <- beta1.hat + c(tstar, - tstar)
CI.beta1

# Built-in intervals for comparison (also uses the t(n - 2) quantile).
confint(my_model)


# 27/71
# Full model summary: coefficient tests, R^2, residual SE, F test.
my_model <- lm(wind ~ temp)
summary(my_model)


# 28/71
my_model <- lm(wind ~ temp)
summary(my_model)

coef(my_model)  # estimated coefficients
vcov(my_model)  # variance-covariance matrix of the coefficient estimates


#29/71
# A summary.lm object is a list of 11 named components; here they are
# accessed by position. names(summary(my_model)) lists the names.
summary(my_model)[[1]]  # "call": the matched lm() call
summary(my_model)[[2]]  # "terms": the terms object built from the formula

length(summary(my_model))  # 11 components
names(summary(my_model))
summary(my_model)$sigma  # residual standard error, accessed by name
summary(my_model)[[6]]  # the same value, accessed by position
length(summary(my_model)[[1]])
length(summary(my_model)[[2]])
length(summary(my_model)[[3]])


# 30/71
summary(my_model)[[3]]  # "residuals": residuals for the data points
summary(my_model)[[4]]  # "coefficients": parameter table (Estimate/SE/t/p)
summary(my_model)[[4]][[1]]  # intercept estimate (matrix entries are column-major)
summary(my_model)[[4]][[2]]  # slope estimate; entries run up to [[8]] for this 2 x 4 table

str(summary(my_model)[[4]])


# 31/71
summary(my_model)[[5]]  # "aliased": logical, which coefficients are aliased (collinear)
summary(my_model)[[6]]  # "sigma": residual standard error
summary(my_model)[[7]]  # "df": degrees of freedom (p, n - p, model-matrix columns)
summary(my_model)[[8]]  # "r.squared": fraction of total variation explained by the model
summary(my_model)[[9]]  # "adj.r.squared": adjusted R^2
summary(my_model)[[10]]  # "fstatistic": F value with numerator and denominator df
summary(my_model)[[11]]  # "cov.unscaled": unscaled covariance matrix of the estimates


# 32/71
# Components stored on the lm object itself (see names(my_model)).
my_model <- lm(wind ~ temp)
names(my_model)
my_model$coefficients
my_model$fitted.values
my_model$residuals

# ANOVA-style summary of an lm fit; [[1]] is the table (a data frame).
summary.aov(my_model)
summary.aov(my_model)[[1]][[1]]  # first column: Df
summary.aov(my_model)[[1]][[5]]  # fifth column: Pr(>F)


# 33/71
# One-way ANOVA: Sepal.Length (column 1) by Species (column 5), then
# flatten the summary so the p-value can be pulled out by name.
iris_aov <- aov(iris[, 1] ~ iris[, 5])
print(iris_aov)
iris_sum_aov <- summary(iris_aov)
print(iris_sum_aov)
iris_sum_aov2 <- unlist(iris_sum_aov)
print(iris_sum_aov2)
names(iris_sum_aov2)
iris_sum_aov2["Pr(>F)1"]


# 34/71
# Refit after dropping the observation(s) at the maximum temperature;
# update() reuses the original call with the new subset argument.
new_model <- update(my_model, subset = (temp != max(temp)))
summary(new_model)


# 35/71
summary(wind)
summary(temp)
# Predictions at new temperature values; the list names must match the
# predictor name used in the model formula ("temp").
predict(my_model, list(temp = 75))
predict(my_model, list(temp = c(66, 80, 100)))


# 37/71
? plot.lm

wind <- airquality$Wind
temp <- airquality$Temp
my_model <- lm(wind ~ temp)
plot(my_model, which = 1:6)

plot(fitted(my_model), residuals(my_model), xlab = "Fitted values",
     ylab = "Residuals")
abline(h = 0, lty = 2)


# 39/71
qqnorm(residuals(my_model))
qqline(residuals(my_model))


# 44/71
head(swiss)


# 45/71
# Scatterplot matrix with smoothers; points colored by whether the
# province is majority Catholic (> 50%).
pairs(swiss, panel  = panel.smooth, main = "swiss data",
      col = 3 + (swiss$Catholic > 50))


# 45/71
# Full model: Fertility regressed on all the other variables.
summary(my_lm <- lm(Fertility ~ ., data = swiss))


# 47/71
# Stepwise model selection by AIC (backward from the full model).
smy_lm <- step(my_lm)


# 48/71
summary(smy_lm)


# 49/71
library(readxl)
Johnson <- read_excel("Johnson.xlsx")
colnames(Johnson) <- c("MonthsSinceLastService", "RepairType", "RepairTime")
print(Johnson)
str(Johnson)


# 51/71
Johnson$RepairType <- factor(Johnson$RepairType, levels = c("Mechanical", "Electrical"))
Johnson$RepairType
Johnson_lm <- lm(RepairTime ~ MonthsSinceLastService + RepairType, data = Johnson)
summary(Johnson_lm)
anova(Johnson_lm)

# 52/71
# Scatter diagram with one fitted line per repair type.
# BUG FIXES: the original referenced a nonexistent `Johnson.lm` (the
# fitted model is `Johnson_lm`), misspelled "Electrical", and used
# attach(); columns are referenced explicitly instead.
plot(Johnson$MonthsSinceLastService, Johnson$RepairTime,
     xlab = "Months Since Last Service",
     ylab = "Repair Time (hours)",
     main = "Scatter Diagram for the Johnson Filtration Repair Data")
# Mechanical (reference level): intercept + common slope.
abline(a = Johnson_lm$coefficients[1], b = Johnson_lm$coefficients[2],
       col = "darkgreen", lwd = 2)
text(7, 3, "Repair Type = \n Mechanical", col = "darkgreen")
# Electrical: intercept shifted by the dummy coefficient; same slope.
abline(a = sum(Johnson_lm$coefficients[c(1, 3)]), b = Johnson_lm$coefficients[2],
       col = "blue", lwd = 2)
text(4, 4, "Repair Type = \n Electrical", col = "blue")

# Factor level order: the first level is the reference level.
cities <- sample(c("Taipei", "Seoul", "Tokyo"), 20, replace = TRUE)
factor(cities)
factor(cities, levels = c("Taipei", "Tokyo", "Seoul"))



# 56/71
# Graduate-admissions data (UCLA logistic-regression tutorial).
# mydata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
mydata <- read.csv("data/binary.csv")
dim(mydata)
head(mydata)
summary(mydata)
# vapply() pins the result to one numeric per column; sapply() can
# silently change its return shape on unusual input.
vapply(mydata, sd, numeric(1))

xtabs( ~ admit + rank, data = mydata)
mydata$rank <- factor(mydata$rank)  # treat rank as a categorical predictor
mydata$rank


# 57/71
# Pairs plot colored by admission status (admit + 2 -> palette colors 2/3).
pairs(mydata, col = as.integer(mydata$admit) + 2)

# Admission counts by rank, as grouped bar charts both ways round.
ctab <- table(mydata$admit, mydata$rank)
barplot(ctab, beside = TRUE, legend = rownames(ctab),
        ylim = c(0, 120), xlab = "rank")
text(11, 80, labels = "admission")


barplot(t(ctab), beside = TRUE,
        col = c("lightblue", "mistyrose", "lightcyan", "lavender"),
        legend = rownames(t(ctab)),
        ylim = c(0, 120), xlab = "admission")
text(9, 58, labels = "rank")



# 58/71
mylogit <- glm(admit ~ gre + gpa + rank, data = mydata, family = "binomial")
summary(mylogit)


# 59/71
library(aod) #aod: Analysis of Overdispersed Data
wald.test(b = coef(mylogit), Sigma = vcov(mylogit), Terms = 4:6)

l <- cbind(0, 0, 0, 1, -1, 0)
wald.test(b = coef(mylogit), Sigma = vcov(mylogit), L = l)


# 60/71
exp(cbind(OR = coef(mylogit), confint(mylogit)))


# 61/71
# Predicted admission probability at each rank, holding gre and gpa at
# their sample means.
newdata1 <- with(mydata, data.frame(gre = mean(gre), gpa = mean(gpa),
    rank = factor(1:4)))
newdata1

newdata1$rankP <- predict(mylogit, newdata = newdata1, type = "response")
newdata1


# 62/71
# A grid of GRE values (200-800) for each rank, with gpa held at its mean.
newdata2 <- with(mydata, data.frame(
  gre = rep(seq(from = 200, to = 800, length.out = 100), 4),
  gpa = mean(gpa),
  rank = factor(rep(1:4, each = 100))))
dim(newdata2)
head(newdata2)

# Predictions on the link (log-odds) scale with standard errors, then
# transformed to probabilities with 95% Wald bands via the inverse logit.
newdata3 <- cbind(newdata2, predict(mylogit, newdata = newdata2, type = "link", se = TRUE))
newdata3 <- within(newdata3, {
  PredictedProb <- plogis(fit)  # inverse logit of the linear predictor
  LL <- plogis(fit - (1.96 * se.fit))  # lower 95% band
  UL <- plogis(fit + (1.96 * se.fit))  # upper 95% band
})
head(newdata3)


# 63/71
library(ggplot2)
# Predicted admission probability vs GRE by rank, with 95% CI ribbons.
# ggplot2 >= 3.4.0: `linewidth` replaces the deprecated `size`
# aesthetic for line geoms.
ggplot(newdata3, aes(x = gre, y = PredictedProb)) +
  geom_ribbon(aes(ymin = LL, ymax = UL, fill = rank), alpha = 0.2) +
  geom_line(aes(colour = rank), linewidth = 1)


# 64/71
# Likelihood-ratio test of the fitted model against the null model.
with(mylogit, null.deviance - deviance)  # chi-square statistic
with(mylogit, df.null - df.residual)  # its degrees of freedom
with(mylogit, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
logLik(mylogit)  # log-likelihood of the fitted model


# 65/71
# Sequential (Type I) chi-square tests, adding one term at a time.
anova(mylogit, test = "Chisq")

# Single-term deletions: chi-square test for dropping each term.
drop1(mylogit, test = "Chisq")

library(car) # Companion to Applied Regression
Anova(mylogit)  # car::Anova tests each term given the others


# 69/71
head(airquality)
# Multiple regression; rows with missing values are dropped by the
# default na.action (na.omit) — Ozone and Solar.R contain NAs.
model0 <- lm(Ozone ~ Wind + Temp + Solar.R, data = airquality)


# 70/71
# Pairwise-complete correlations and a scatterplot matrix to eyeball
# collinearity among the predictors.
cor(airquality[, 1:4], use = "pairwise")
pairs(airquality[, 1:4])

library(car)
vif(model0)  # car::vif: one variance inflation factor per predictor


# 68/71
summary(model0)

library(fmsb)
model1 <- lm(Wind ~ Temp + Solar.R, data = airquality)
model2 <- lm(Temp ~ Wind + Solar.R, data = airquality)
model3 <- lm(Solar.R ~ Wind + Temp, data = airquality)
VIF(model0)
sapply(list(model1, model2, model3), VIF)
