#################################################
# B01-1: 資料處理與篩選                         #
# 吳漢銘 國立政治大學統計學系                   #
# https://hmwu.idv.tw                           #
#################################################


# 12/113
begin.experiment <- data.frame(
  name = c("A", "B", "C", "D", "E", "F"),
  weights = c(270, 263, 294, 218, 305, 261)
)
middle.experiment <- data.frame(name = c("G", "H", "I"),
                                weights = c(169, 181, 201))
end.experiment <- data.frame(name = c("C", "D", "A", "H", "I"),
                             weights = c(107, 104, 104, 102, 100))
(common <- intersect(begin.experiment$name, end.experiment$name))
(b.at <- is.element(begin.experiment$name, common))
(e.at <- is.element(end.experiment$name, common))
experiment <- rbind(cbind(begin.experiment[b.at, ], time = "begin"),
                    cbind(end.experiment[e.at, ], time = "end"))
experiment

tapply(experiment$weights, experiment$time, mean)


# 13/113

set.seed(12345)
grade <- as.factor(sample(c("大一", "大二", "大三", "大四"), 50, replace = T))
bloodtype <- as.factor(sample(c("A", "AB", "B", "O"), 50, replace = T))
record <- data.frame(grade, bloodtype)
head(record)
record.t <- table(record)
record.t

as.data.frame(record.t)


# 14/113
margin.table(record.t, 1)
margin.table(record.t, 2)
colSums(record.t)
rowSums(record.t)
colMeans(record.t)
rowMeans(record.t)

prop.table(record.t)
prop.table(record.t, margin = 1) # row margin
prop.table(record.t, margin = 2) # column  margin

set.seed(12345)
(x <- sample(1:10, 5, replace = T))
(y <- tabulate(x))
names(y) <- as.character(1:max(x))
y


# 15/113
Titanic

Titanic.df <- as.data.frame(Titanic)
Titanic.df

xtabs(Freq ~ Sex + Age, data = Titanic.df)
xtabs(Freq ~ Sex + Age,
      data = Titanic.df,
      subset = Class %in% c("1st", "2nd"))


# 16/113
sale <- read.table("itemsale.csv", sep = ",", header = T)
sale

attach(sale)
tb <- xtabs(Count ~ Item + Date)
rbind(cbind(tb, row.total = margin.table(tb, 1)),
      col.total = c(margin.table(tb, 2), sum(tb)))

detach(sale)


# 17/113
library(epitools)
survey <- array(0, dim = c(3, 2, 1))
survey[, 1, 1] <- c(2, 0, 1)
survey[, 2, 1] <- c(3, 2, 4)
Satisfactory <- c("Good", "Fair", "Bad")
Sex <- c("Female", "Male")
Times <- c("First")
dimnames(survey) <- list(Satisfactory, Sex, Times)
names(dimnames(survey)) <- c("Satisfactory", "Sex", "Times")
survey

(survey.ex <- expand.table(survey))


# 18/113
data(HairEyeColor)
HairEyeColor

HairEyeColor.ex <- expand.table(HairEyeColor)
HairEyeColor.ex

as.data.frame(HairEyeColor)


# 19/113
elections <- read.csv('elections-2000.csv')
election
elections.stacked <- cbind(stack(elections[, -1]),
                           county = elections$County)
elections.stacked

plot(elections.stacked[, c(2, 1)])
boxplot(elections[, -1])


# 20/113
mydata <-
  data.frame(Area1 = c("A", "B", "B", "C"),
             Area2 = c("A", "D", "E", "B"))
rownames(mydata) <- paste("rater", 1:4, sep = "-")
mydata
stack(mydata)

mydata.stack <- stack(lapply(mydata, as.character))
colnames(mydata.stack) <-  c("Rate", "Area")
mydata.stack


# 21/113
head(state.x77)
dim(state.x77)
state.region
aggregate(state.x77, list(Region = state.region), mean)


# 22/113
## Compute the averages according to region and the occurrence of more
## than 130 days of frost.
aggregate(
  state.x77,
  by = list(Region = state.region,
            Cold = state.x77[, "Frost"] > 130),
  FUN = function(x) {
    round(mean(x), 2)
  }
)

aggregate(
  state.x77,
  by = list(Region = state.region,
            Cold = state.x77[, "Frost"] > 130),
  FUN = function(x) {
    round(sqrt(sum(x ^ 2)), 2)
  }
)



# 23/113
presidents
aggregate(presidents, nfrequency = 1, FUN = mean)
aggregate(
  presidents,
  nfrequency = 1,
  FUN = weighted.mean,
  w = c(1, 1, 0.5, 1)
)


# 24/113
testDF <- data.frame(
  v1 = c(1, 3, 5, 7, 8, 3, 5, NA, 4, 5, 7, 9),
  v2 = c(11, 33, 55, 77, 88, 33, 55, NA, 44, 55, 77, 99)
)
by1 <- c("red", "blue", 1, 2, NA, "big", 1, 2, "red", 1, NA, 12)
by2 <-
  c("wet", "dry", 99, 95, NA, "damp", 95, 99, "red", 99, NA, NA)
aggregate(x = testDF,
          by = list(by1, by2),
          FUN = "mean")
fby1 <- factor(by1, exclude = "")
fby2 <- factor(by2, exclude = "")
aggregate(x = testDF,
          by = list(fby1, fby2),
          FUN = "mean")


# 25/113
aggregate(weight ~ feed, data = chickwts, mean)

summary(chickwts)

aggregate(breaks ~ wool + tension, data = warpbreaks, mean)

summary(warpbreaks)


# 26/113
summary(esoph)
aggregate(cbind(ncases, ncontrols) ~ alcgp + tobgp, data = esoph, sum)


# 27/113
by(iris[, 1:4] , iris$Species , summary)


# 28/113
by(iris[, 1:4] , iris$Species , mean)

mean(iris[iris$Species == "setosa", 1:4])


# 29/113
by(iris[, 1] , iris$Species , mean)

by(iris[, 1:4] , iris$Species , colMeans)


# 30/113
varMean <- function(x, ...)
  sapply(x, mean, ...)
by(iris[, 1:4], iris$Species, varMean)

with(iris, aggregate(iris[, 1:4], list(Species = iris$Species), FUN = mean))


# 31/113
#cut(x, breaks, labels = NULL,
#    include.lowest = FALSE, right = TRUE, dig.lab = 3, ordered_result = FALSE, ...)

x <- rnorm(50)
(x.cut1 <- cut(x, breaks = -5:5))
table(x.cut1)
(x.cut2 <- cut(x, breaks = -5:5, labels = FALSE))
table(x.cut2)
hist(x, breaks = -5:5, plot = FALSE)$counts


# 32/113
#the outer limits are moved away by 0.1% of the range
age <- sample(0:80, 50, replace = T)
summary(age)
cut(age, 5)
mygroup <- c(0, 15, 20, 50, 60, 80)
(x.cut <- cut(age, mygroup))
table(x.cut)


# 33/113
with(iris, {
  iris.lm <- lm(Sepal.Length ~ Petal.Length)
  summary(iris.lm)
})


# 34/113
#merge(x, y, by = intersect(names(x), names(y)),
#      by.x = by, by.y = by, all = FALSE, all.x = all, all.y = all,
#      sort = TRUE, suffixes = c(".x",".y"),
#      incomparables = NULL, ...)

total <- merge(data.frame.A, data.frame.B, by = "ID")
total <- merge(data.frame.A, data.frame.B, by = c("ID", "Country"))


# 37/113
authors <- data.frame(
     surname = I(c("Tukey", "Venables", "Tierney", "Ripley", "McNeil")),
     nationality = c("US", "Australia", "US", "UK", "Australia"),
     deceased = c("yes", rep("no", 4)))
books <- data.frame(
     name = I(c("Tukey", "Venables", "Tierney",
              "Ripley", "Ripley", "McNeil", "R Core")),
     title = c("Exploratory Data Analysis",
               "Modern Applied Statistics ...",
               "LISP-STAT",
               "Spatial Statistics", "Stochastic Simulation",
               "Interactive Data Analysis",
               "An Introduction to R"),
     other.author = c(NA, "Ripley", NA, NA, NA, NA, "Venables & Smith"))
authors 
books 

author <- c("Tukey", "Venables", "Tierney", "Ripley", "McNeil")
(a1 <- data.frame(surname = author))
(a2 <- data.frame(surname = I(author)))

class(a1$surname) # [1] "factor"
class(a2$surname) # [1] "AsIs"
a1$surname
a2$surname


# 38/113
m1 <- merge(authors, books, by.x = "surname", by.y = "name")
(m2 <- merge(books, authors, by.x = "name", by.y = "surname"))

merge(authors,
      books,
      by.x = "surname",
      by.y = "name",
      all = TRUE)


# 39/113
(x <-
    data.frame(
      k1 = c(NA, NA, 3, 4, 5),
      k2 = c(1, NA, NA, 4, 5),
      data = 1:5
    ))
(y <-
    data.frame(
      k1 = c(NA, 2, NA, 4, 5),
      k2 = c(NA, NA, 3, 4, 5),
      data = 1:5
    ))
merge(x, y, by = c("k1", "k2")) # NA's match
merge(x, y, by = "k1") # NA's match, so 6 rows
merge(x, y, by = "k2", incomparables = NA) # 2 rows


# 40/113
stories <- read.table(header = TRUE,
                      text = '
    storyid  title
     1       lions
     2      tigers
     3       bears
 ')
data <- read.table(
  header = TRUE,
  text = '
     subject storyid rating
           1       1    6.7
           1       2    4.5
           1       3    3.7
           2       2    3.3
           2       3    4.1
           2       1    5.2
 '
)
merge(stories, data, by = "storyid")

stories2 <- read.table(header = TRUE,
                       text = '
    id       title
     1       lions
     2      tigers
     3       bears
 ')
merge(stories2, data, by.x = "id", by.y = "storyid")


# 41/113
animals <- read.table(
  header = T,
  text = '
    size type         name
   small  cat         lynx
     big  cat        tiger
   small  dog    chihuahua
     big  dog "great dane"
 '
)
observations <- read.table(
  header = T,
  text = '
    number  size type
         1   big  cat
         2 small  dog
         3 small  dog
         4   big  dog
 '
)
merge(observations, animals, c("size", "type"))


# 42/113
#split(x, f, drop = FALSE, ...)
#split(x, f, drop = FALSE, ...) <- value
#unsplit(value, f, drop = FALSE)

n <- 10
edu <- factor(sample(1:4, n, replace = T))
score <- sample(0:100, n)
cbind(edu, score)

score.edu <- split(score, edu)
score.edu

unsplit(score.edu, edu)
sort(edu)
unsplit(score.edu, sort(edu))


# 43/113
head(airquality)
month <- airquality$Month
airquality.month <- split(airquality, month)
mydata <- lapply(airquality.month, transform,
                 Oz.Z = scale(Ozone))
airquality2 <- unsplit(mydata, month)
head(airquality2)

airquality.month

transform(airquality, Ozone = -Ozone)
transform(airquality, new = -Ozone, Temp = (Temp - 32) / 1.8)

attach(airquality)
transform(Ozone, logOzone = log(Ozone))


# 44/113
split(1:10, 1:2)

mat <- cbind(x = 1:10, y = (-4:5) ^ 2)
cbind(mat, col(mat))
split(mat, col(mat))


# 45/113
(x <- matrix(1:24, nrow = 4))

#1: rows, 2:columns
apply(x, 1, sum)

#apply function to the individual elements
apply(x, 1, sqrt)

apply(x, 2, sqrt)


# 46/113
# generate score data
math <- sample(1:100, 50, replace = T)
english <- sample(1:100, 50, replace = T)
algebra <- sample(1:100, 50, replace = T)
ScoreData <- cbind(math, english, algebra)
head(ScoreData, 5)

sdata1 <- apply(ScoreData, 2, myfun)
head(sdata1, 5)

head(apply(ScoreData, 2, function(x)
  sqrt(x) * 10), 5)

sdata2 <- apply(ScoreData, 2, myfun2, attend = 5)
head(sdata2, 5)


# 47/113
set.seed(12345)
scores <- sample(0:100, 50, replace = T)
grade <- as.factor(sample(c("大一", "大二", "大三", "大四"), 50, replace = T))
bloodtype <- as.factor(sample(c("A", "AB", "B", "O"), 50, replace = T))
tapply(scores, grade, mean)

tapply(scores, bloodtype, mean)

summary(warpbreaks[, -1])

tapply(warpbreaks$breaks, warpbreaks[, -1], sum)


# 48/113
n <- 20
(my.factor <- factor(rep(1:3, length = n), levels = 1:5))

table(my.factor)

tapply(1:n, my.factor, range)

presidents
class(presidents)

# gives the positions in the cycle of each observation.
cycle(presidents)

tapply(presidents, cycle(presidents), mean, na.rm = T)


# 49/113
a <- c("a", "b", "c", "d")
b <- c(1, 2, 3, 4, 4, 3, 2, 1)
c <- c(T, T, F)
list.object <- list(a, b, c)
my.la1 <- lapply(list.object, length)
my.la1

my.la2 <- lapply(list.object, class)
my.la2

x <-
  list(
    a = 1:10,
    beta = exp(-3:3),
    logic = c(TRUE, FALSE, FALSE, TRUE)
  )
lapply(x, mean) # return list

# median and quartiles for each list element
lapply(x, quantile, probs = 1:3 / 4)


# 50/113
x <-
  list(
    a = 1:10,
    beta = exp(-3:3),
    logic = c(TRUE, FALSE, FALSE, TRUE)
  )
sapply(x, mean)  # return vector

sapply(x, quantile)

lapply(x, quantile)


# 51/113
(i37 <- sapply(3:7, seq))

sapply(i37, fivenum)

(v <- structure(10 * (5:7), names = LETTERS[1:3]))

f2 <- function(x, y)
  outer(rep(x, length.out = 3), y)
sapply(v, f2, y = 1:4, simplify = "array")

sapply(v, f2, y = 1:4)


# 52/113
my.list <- list(
  name = c("George", "John", "Tom"),
  wife = c("Mary", "Sue", "Nico"),
  no.children = c(3, 2, 0),
  child.ages = list(c(4, 7, 9), c(2, 5), NA)
)

# 取出某一家庭的資訊
my.list[[1]][1]
my.list[[3]][1]
my.list[[4]][1]

my.list[[1:4]][1] # Error

George.family <- sapply(my.list, "[[", 1)
George.family


# 53/113
mapply(rep, 1:4, 4:1)
mapply(rep, times = 1:4, x = 4:1)

mapply(rep, times = 1:4, MoreArgs = list(x = 42))
mapply(function(x, y)
  seq_len(x) + y,
  c(a =  1, b = 2, c = 3),
  c(A = 10, B = 0, C = -10))


# 54/113
word <- function(C, k)
  paste(rep.int(C, k), collapse = "")
mapply(word, LETTERS[1:6], 6:1, SIMPLIFY = FALSE)
str(mapply(word, LETTERS[1:6], 6:1, SIMPLIFY = FALSE))


# 56/113
mydata <- list(list(a = pi, b = list(c = 1:1)), d = "a test")
mydata

rapply(mydata, sqrt, classes = "numeric", how = "replace")

rapply(mydata,
       nchar,
       classes = "character",
       deflt = as.integer(NA),
       how = "list")

rapply(mydata,
       nchar,
       classes = "character",
       deflt = as.integer(NA),
       how = "unlist")

# 58/113
browseVignettes(package = "dplyr")

x <- rnorm(10)
x %>% max
# is the same thing as:
max(x)


# 59/113
library(nycflights13)
dim(flights)
head(flights, 4)


# 60/113
library(dplyr)
filter(flights, month == 1, day == 1)   #same as filter(flights, month == 1 & day == 1)

flights[flights$month == 1 & flights$day == 1,]

table(flights$carrier)

filter(flights, carrier %in% c("OO", "YV"))


# 61/113
subset(flights, dep_delay < 0, select = c(carrier, distance))

subset(flights, origin == "JFK", select = -year)

airquality.sub1 <- subset(airquality, Day == 1, select = -Temp)

head(airquality, 3)

airquality.sub2 <-
  subset(airquality, Temp > 80, select = c(Ozone, Temp))
head(airquality.sub2, 3)

airquality.sub3 <- subset(airquality, select = Ozone:Wind)


# 62/113
#  same as flights[order(flights$month, flights$day, flights$distance), ]
arrange(flights, month, day, distance)

# same as flights[order(flights$carrier, desc(flights$arr_delay)), ]
arrange(flights, carrier, desc(arr_delay))


# 64/113
select(flights, origin, carrier, distance) #Select columns by name

select(flights, dep_time:arr_delay) #Select all columns between variables (inclusive)

select(flights,-(year:day)) # Select all columns except those from year to day (inclusive)


# 65/113
select(flights, tail_num = tailnum) #drops all the variables not explicitly mentioned

rename(flights, DepTime = dep_time, DepDelay = dep_delay)


# 66/113
iris <- tbl_df(iris) # little nicer for printing
head(iris, 3)

select(iris, matches(".t."))

select(iris, starts_with("Petal"))

select(iris, ends_with("Width"))

select(iris, contains("etal"))

select(iris, Petal.Length, Petal.Width)

vars <- c("Petal.Length", "Petal.Width")
select(iris, one_of(vars))


# 67/113
select(iris,-starts_with("Petal"))  # Drop variables

select(iris,-ends_with("Width"))

select(iris,-contains("etal"))

select(iris,-matches(".t."))

select(iris,-Petal.Length,-Petal.Width)


# 68/113
distinct(select(flights, tailnum))

distinct(select(flights, origin, dest))


# 69/113
mutate(flights,
       gain = arr_delay - dep_delay,
       speed = distance / air_time * 60)

mutate(flights,
       gain = arr_delay - dep_delay,
       gain_per_hour = gain / (air_time / 60))

transform(flights,
          gain = arr_delay - dep_delay,
          gain_per_hour = gain / (air_time / 60))

transmute(flights,
          gain = arr_delay - dep_delay,
          gain_per_hour = gain / (air_time / 60))


# 70/113
head(mtcars, 3)

group_by(mtcars, cyl)

summarise(group_by(mtcars, cyl),
          m = mean(disp),
          sd = sd(disp))

summarise(flights, delay = mean(dep_delay, na.rm = TRUE))

mean(flights$dep_delay, na.rm = T)


# 71/113
sample_n(iris, 5)

sample_frac(iris, 0.05)


# 72/113
by_tailnum <- group_by(flights, tailnum)
by_tailnum

delay <- summarise(
  by_tailnum,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)
delay

delay <- filter(delay, count > 20, dist < 2000)
delay


# 73/113
ggplot(delay, aes(dist, delay)) +
  geom_point(aes(size = count), alpha = 1 / 2) +
  geom_smooth() +
  scale_size_area()


# 74/113
destinations <- group_by(flights, dest)
summarise(destinations,
          planes = n_distinct(tailnum),
          flights = n())


# 75/113
daily <- group_by(flights, year, month, day)
(per_day  <- summarise(daily, flights = n()))

per_month <- summarise(per_day, flights = sum(flights))

(per_year  <- summarise(per_month, flights = sum(flights)))


# 76/113
a1 <- group_by(flights, year, month, day)
a2 <- select(a1, arr_delay, dep_delay)
a3 <- summarise(a2,
                arr = mean(arr_delay, na.rm = TRUE),
                dep = mean(dep_delay, na.rm = TRUE))
a4 <- filter(a3, arr > 30 | dep > 30)

filter(summarise(
  select(group_by(flights, year, month, day),
         arr_delay, dep_delay),
  arr = mean(arr_delay, na.rm = TRUE),
  dep = mean(dep_delay, na.rm = TRUE)
),
arr > 30 | dep > 30)

flights %>%
  group_by(year, month, day) %>%
  select(arr_delay, dep_delay) %>%
  summarise(arr = mean(arr_delay, na.rm = TRUE),
            dep = mean(dep_delay, na.rm = TRUE)) %>%
  filter(arr > 30 | dep > 30)


# 80/113
preg <-
  read.table(
    "preg.txt",
    sep = "\t",
    header = T,
    stringsAsFactors = FALSE
  )
preg

preg2 <-
  read.table(
    "preg2.txt",
    sep = "\t",
    header = T,
    stringsAsFactors = FALSE
  )
preg2

library(tidyr)
library(dplyr)
preg.tidy <- preg %>%
  gather(treatment, n, treatmenta:treatmentb) %>%
  mutate(treatment = gsub("treatment", "", treatment)) %>%
  arrange(name, treatment)
preg.tidy


# 82/113
gather(data, key, value, ..., na.rm = FALSE, convert = FALSE)

xdata <- data.frame(Group = letters[1:4], matrix(rnorm(12), ncol = 3))
xdata

gather(xdata, key = KEY, value = VALUE,-Group)


# 83/113
gather(xdata, key = KEY, value = VALUE)

gather(xdata, key = KEY, value = VALUE, Group)

gather(xdata, key = KEY, value = VALUE, X1)

gather(xdata, key = KEY, value = VALUE, X1, X2)

gather(xdata, key = KEY, value = VALUE, X1:X3)


# 84/113
(mini_iris <- iris[c(1, 51, 101),])

gather(mini_iris,
       key = flower_att,
       value = measurement,
       Sepal.Length:Petal.Width)

gather(mini_iris, key = flower_att, value = measurement,-Species)


# 85/113
pew <-
  tbl_df(read.csv(
    "pew.csv",
    stringsAsFactors = FALSE,
    check.names = FALSE
  ))
pew

pew %>%
  gather(income, frequency,-religion)


# 86/113
stocks

# stocks %>% gather(stock, price, -time)
stocks.ga <- gather(stocks, key = stock,
                    value = price,-time)
stocks.ga

stocks <- data.frame(
  time = as.Date('2009-01-01') + 0:9,
  X = rnorm(10, 0, 1),
  Y = rnorm(10, 0, 2),
  Z = rnorm(10, 0, 4)
)


# 87/113
billboard <-
  tbl_df(read.csv("billboard.csv", stringsAsFactors = FALSE))
billboard


# 88/113
billboard2 <- billboard %>%
  gather(week, rank, wk1:wk76, na.rm = TRUE)
billboard2


# 89/113
billboard3 <- billboard2 %>%
  mutate(week = extract_numeric(week),
         date = as.Date(date.entered) + 7 * (week - 1)) %>%
  select(-date.entered)
billboard3


# 90/113
billboard3 %>% arrange(artist, track, week)

billboard3 %>% arrange(date, rank)


# 91/113
separate(
  data,
  col,
  into,
  sep = "[^[:alnum:]]+",
  remove = TRUE,
  convert = FALSE,
  extra = "warn",
  fill = "warn",
  ...
)


# 92/113
(df <- data.frame(x = c(NA, "a.b", "a.d", "b.c")))

df %>% separate(x, c("X1", "X2"))

# separate(df, x, c("X1", "X2"))
(df <- data.frame(x = c("a", "a b", "a b c", NA)))

df %>% separate(x, c("X1", "X2"))

(df %>% separate(x, c("X1", "X2"), extra = "drop", fill = "right"))


# 93/113
df

df %>% separate(x, c("X1", "X2"), extra = "merge", fill = "left")

(df <- data.frame(x = c("x: 123", "y: error: 7")))

df %>% separate(x, c("Key", "Value"), ": ", extra = "merge")


# 94/113
tb <- tbl_df(read.csv("tb.csv", stringsAsFactors = FALSE))
tb


# 95/113
tb2 <- tb %>% gather(demo, n,-iso2,-year, na.rm = TRUE)
tb2

tb3 <- tb2 %>% separate(demo, c("sex", "age"), 1)
tb3


# 96/113
spread(data,
       key,
       value,
       fill = NA,
       convert = FALSE,
       drop = TRUE)

(df <- data.frame(
  x = c("a", "b"),
  y = c(3, 4),
  z = c(5, 6)
))

df %>% spread(x, y)

df %>% spread(x, y) %>% gather(x, y, a:b, na.rm = TRUE)


# 97/113
stocks

stocks.ga <- stocks %>% gather(company, price,-time)
stocks.ga

stocks.ga %>% spread(company, price)

stocks.ga %>% spread(time, price)

stocks <- data.frame(
  time = as.Date('2009-01-01') + 0:9,
  X = rnorm(10, 0, 1),
  Y = rnorm(10, 0, 2),
  Z = rnorm(10, 0, 4)
)


# 98/113
weather <- tbl_df(read.csv("weather.csv", stringsAsFactors = FALSE))
weather


# 99/113
# Tidy the dataset by gathering the day columns
weather2 <- weather %>% gather(day, value, d1:d31, na.rm = TRUE)
weather2

# do a little cleaning
weather3 <- weather2 %>%
  mutate(day = extract_numeric(day)) %>%
  select(id, year, month, day, element, value) %>%
  arrange(id, year, month, day)
weather3


# 100/113
weather3 %>% spread(element, value)


# 101/113
billboard3


# 102/113
#First extract a song dataset
song <- billboard3 %>%
  select(artist, track, year, time) %>%
  unique() %>%
  mutate(song_id = row_number())
song


# 103/113
rank <- billboard3 %>%
  left_join(song, c("artist", "track", "year", "time")) %>%
  select(song_id, date, week, rank) %>%
  arrange(song_id, date)
rank


# 104/113
library(data.table)
# create a data.table object
id <- letters[1:6]
g <- sample(c("F", "M"), 6, replace = T)
s <- sample(0:100, 6, replace = T)
p <- sample(c("T", "F"), 6, replace = T)
DT <- data.table(
  ID = id,
  gender = g,
  score = s,
  pass = p
)
str(DT)


# 105/113
flights <- fread("flights14.csv")
# Get all the flights with “JFK” origin airport in June
ans <- flights[origin == "JFK" & month == 6L]
head(ans)

# flights[flights$origin == "JFK" & flights$month == 6L, ] OK
# flights[origin == "JFK" & month == 6L, ] OK

# Get the first two rows from flights
ans <- flights[1:2]
ans

# Sort flights by origin in ascending order, and then by dest in descending order
ans <- flights[order(origin,-dest)]
head(ans)


# 106/113
# Select arr_delay column, return a "vector".
ans <- flights[, arr_delay]
head(ans)

# Select arr_delay column, return a "data.table"
ans <- flights[, list(arr_delay)]
head(ans)

ans <- flights[, list(arr_delay, dep_delay)]
# same as flights[, .(arr_delay, dep_delay)]
head(ans)

# Select both arr_delay and dep_delay columns and rename
ans <- flights[, .(X1 = arr_delay, X2 = dep_delay)]
head(ans)


# 107/113
# How many trips have had total delay < 0?
ans <- flights[, sum((arr_delay + dep_delay) < 0)]
ans

# Calculate the average arrival and departure delay for all flights with
#“JFK” as the origin airport in the month of June.
ans <- flights[origin == "JFK" & month == 6L,
               .(m.arr = mean(arr_delay), m.dep = mean(dep_delay))]
ans

# How many trips have been made in 2014 from “JFK” airport in the month of June?
ans <- flights[origin == "JFK" & month == 6L, length(dest)]
ans

# .N: the number of observations in the current group.
ans <- flights[origin == "JFK" & month == 6L, .N]
ans

# inefficient: nrow(flights[origin == "JFK" & month == 6L])


# 108/113
# Select both arr_delay and dep_delay columns (like in a data.frame)
ans <- flights[, c("arr_delay", "dep_delay"), with = FALSE]
head(ans)

# returns all columns except arr_delay and dep_delay
ans <- flights[,!c("arr_delay", "dep_delay"), with = FALSE]
# or
ans <- flights[,-c("arr_delay", "dep_delay"), with = FALSE]
head(ans)

# returns year, month and day
ans <- flights[, year:day, with = FALSE]
# returns day, month and year
ans <- flights[, day:year, with = FALSE]
head(ans)


#109/113
ans <- flights[, .N, by = origin]
# flights[, .(.N), by = "origin"] OK
ans

# the number of trips for each origin airport for carrier code “AA”
ans <- flights[carrier == "AA", .N, by = origin]
ans


# the total number of trips for each origin, dest pair for carrier code “AA”?
ans <- flights[carrier == "AA", .N, by = .(origin, dest)]
# flights[carrier == "AA", .N, by = c("origin", "dest")] OK
head(ans)


# the average arrival and departure delay for each orig, dest pair for each month for carrier code “AA”?
ans <- flights[carrier == "AA",
               .(mean(arr_delay), mean(dep_delay)),
               by = .(origin, dest, month)]
ans


# 110/113
# increasingly order by all the grouping variables (keyby)
ans <- flights[carrier == "AA",
               .(mean(arr_delay), mean(dep_delay)),
               keyby = .(origin, dest, month)]
ans

# Chaining: forming a chain of operations, DT[ ... ][ ... ][ ... ]
# order ans using the columns origin in ascending order, and dest in descending order
ans <-
  flights[carrier == "AA", .N, by = .(origin, dest)][order(origin,-dest)]
# same as
# ans <- flights[carrier == "AA", .N, by = .(origin, dest)]
# ans[order(origin, -dest)]
head(ans)

# Expressions in by
# how many flights started late but arrived early (or on time), started and arrived late etc…
ans <- flights[, .N, .(dep_delay > 0, arr_delay > 0)]
ans


# 111/113
DT <- data.table(
  ID = c("b", "b", "b", "a", "a", "c"),
  X1 = 1:6,
  X2 = 7:12,
  X3 = 13:18
)
DT

DT[, print(.SD), by = ID]


# 112/113
# compute means on multiple columns for each groups
DT[, lapply(.SD, mean), by = ID]


ans <- flights[, head(.SD, 2), by = month]
head(ans)


# 113/113
DT

# concatenate columns a and b for each group in ID
DT[, .(NewX = c(X1, X2)), by = ID]

DT[, .(NewX = list(c(X1, X2))), by = ID]

DT[, print(c(X1, X2)), by = ID]

DT[, print(list(c(X1, X2))), by = ID]
