#################################################
# G01: R網路爬蟲                                #
# 吳漢銘 國立政治大學統計學系                   #
# https://hmwu.idv.tw/                          #
#################################################


# 6/44
# Install rvest only when it is not already available, so that re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest")
}
library(rvest)

# Download and parse the IMDb page for title tt1490017.
html <- read_html("http://www.imdb.com/title/tt1490017/")

# Every <span> element with class "itemprop" anywhere on the page.
cast <- html_nodes(html, "span.itemprop")
html_text(cast)

# Any element with class "itemprop" nested inside the element with id "titleCast".
cast <- html_nodes(html, "#titleCast .itemprop")
html_text(cast)

# Only <span> elements with class "itemprop" nested inside "#titleCast".
cast <- html_nodes(html, "#titleCast span.itemprop")
html_text(cast)


# 11/44
# Parse the IMDb page for title tt1490017.
lego.movie <- read_html("http://www.imdb.com/title/tt1490017/")

# Text of every <span> nested inside a <strong> element.
rating <- html_text(html_nodes(lego.movie, "strong span"))
rating

# "src" attribute of every <img> nested inside an element with class "poster".
poster <- html_attr(html_nodes(lego.movie, ".poster img"), "src")
poster


# 13/44
# install.packages("tmcn", repos="http://R-Forge.R-project.org")
# "tmcn" is a package used on Windows to handle Chinese text.
# (Original note: it does not need to be installed on mac or linux.)
library(tmcn)
# html <- read_html("http://news.ltn.com.tw/list/BreakingNews")
html <- read_html("http://news.ltn.com.tw/list/breakingnews")

# news.title <- html_nodes(html, ".picword")
# Select the elements with class "ph" and show all of their attributes.
news.title <- html %>% html_nodes(".ph")
news.title %>% html_attrs()

# news.title.utf8 <- toUTF8(html_text(news.title)) # convert to UTF8
# The headline is read from the "data-desc" attribute and converted to UTF-8.
news.title.utf8 <- news.title %>%
  html_attr("data-desc") %>%
  toUTF8()

# Pair each headline with the "href" attribute of the same node.
title.href <- news.title %>% html_attr("href")
my.news <- data.frame(
  title = news.title.utf8,
  href = title.href,
  stringsAsFactors = FALSE
)
my.news


# 14/44
# appledaily <- "http://www.appledaily.com.tw"
# url.main <- paste0(appledaily, "/realtimenews/section/new/")

# Parse the realtime-news listing page.
url.main <- "https://tw.appledaily.com/new/realtime"
apple.news <- read_html(url.main)

# news.rddt <- html_nodes(apple.news, '.rtddt')
# news.time <- html_text(html_nodes(news.rddt, 'time'))

# Each field is selected from descendants of elements with class "rtddt":
# <time> for the timestamp, <h1> for the title, <h2> for the category and
# the "href" attribute of <a> for the article link.
news.time <- apple.news %>% html_nodes('.rtddt time') %>% html_text()
news.title <- apple.news %>% html_nodes('.rtddt h1') %>% html_text()
news.category <- apple.news %>% html_nodes('.rtddt h2') %>% html_text()
news.url <- apple.news %>% html_nodes('.rtddt a') %>% html_attr('href')

# realtimenews <- data.frame(time=news.time, title=news.title,
#                  category=news.category, url= paste0(appledaily, news.url))
realtimenews <- data.frame(
  time = news.time,
  title = news.title,
  category = news.category,
  url = news.url
)
realtimenews


# 15/44
# Fetch the page at `x` and return the text of the first node matching the
# CSS selector '.ndArticle_margin p'.
#
# Arguments:
#   x  Location of the page, passed straight to read_html().
#
# Returns whatever html_text() yields for the node selected by html_node().
get_content <- function(x) {
  # Previous selector, kept for reference: '.trans'
  page <- read_html(x)
  html_text(html_node(page, '.ndArticle_margin p'))
}

# Fetch the text of the first article as a quick check.
get_content(as.character(realtimenews$url[1]))

# Fetch every article. as.character() is needed because the url column may
# be a factor. vapply() is used instead of sapply() so the result is always a
# character vector named by URL -- character(0) when there are no URLs -- and
# an error is raised if get_content() ever returns anything other than a
# single string.
url.c <- as.character(realtimenews$url)
news.content <- vapply(url.c, get_content, character(1))


# 18/44
# Collect the links to the posts listed on the index page of the PTT
# R_Language board.
url.main <- 'https://www.ptt.cc/bbs/R_Language/index.html'
href.title <- html_nodes(read_html(url.main), ".title a")
R.hrefs <- html_attr(href.title, 'href')

# Download every post. One list slot per link is preallocated instead of
# growing a vector with c() on each iteration, and seq_along() makes the loop
# a no-op when no links were found (1:length(x) would iterate over c(1, 0)).
article.list <- vector("list", length(R.hrefs))
for (i in seq_along(R.hrefs)) {
  article.url <- paste0('https://www.ptt.cc', R.hrefs[i])
  article <- html_nodes(read_html(article.url), "#main-content")
  article.content <- html_text(article)
  # iconv() with only `from` given converts from UTF-8 to the session's
  # native encoding (its default `to = ""`).
  article.list[[i]] <- iconv(article.content, 'utf8')
  # Pause a random 3-5 seconds between requests.
  Sys.sleep(sample(3:5, 1))
}
# Flatten to a single vector (NULL when there were no links).
R.article.data <- unlist(article.list, use.names = FALSE)
R.article.data


# 19/44
# readHTMLTable {XML} failed (original author's note about this approach)
# Install XML and RCurl only when they are missing.
if (!requireNamespace("XML", quietly = TRUE)) {
  install.packages("XML")
}
# library() takes parentheses; `library{XML}` is a syntax error that prevents
# the whole file from being parsed.
library(XML)
# RCurl: General Network (HTTP/FTP/...) Client Interface for R
if (!requireNamespace("RCurl", quietly = TRUE)) {
  install.packages("RCurl")
}
library(RCurl)

# Download the page with getURL() and hand the result to readHTMLTable().
url <- getURL("http://lishi.tianqi.com/taibei/201701.html")
htmlTable1 <- readHTMLTable(url, header = TRUE)
?readHTMLTable
htmlTable1

# 20/44
library(stringr)
library(tmcn)
# Scrape the monthly weather-history pages of lishi.tianqi.com and stack the
# tables into a single data frame.
#
# Arguments:
#   city       City name as it appears in the URL path (e.g. "taibei").
#              It is inserted into every requested URL.
#   year.from  First year to download.
#   year.to    Last year to download. All twelve months of every year in
#              year.from:year.to are requested, in that order.
#
# Returns a data frame with six columns, named after the first six
# whitespace-separated tokens of each page's table text, with the rows of
# all months bound together; NULL if no page was requested.
#
# Note: don't use for tables that contain missing values. Cells are
# recovered by splitting the table text on whitespace and refilling rows of
# six, so an empty cell shifts every value after it.
get_tianqi_table <- function(city = "taibei", year.from = 2015, year.to = 2016) {

  # "01" ... "12"
  month <- sprintf("%02d", 1:12)
  years <- year.from:year.to
  # One URL per (year, month), ordered year by year.
  url.list <- paste0("http://lishi.tianqi.com/", city, "/",
                     rep(years, each = length(month)), month, ".html")

  # Parse each page into a data frame; they are bound once at the end rather
  # than with rbind() inside a loop.
  monthly.tables <- lapply(url.list, function(x) {
    # x <- "http://lishi.tianqi.com/taibei/201701.html"
    html <- read_html(x)
    wdata <- html_text(html_nodes(html, '.tqtongji2'))
    content <- toTrad(wdata)
    #content.tmp <- str_replace_all(content, "[\r\n\t]", "")
    content.tmp <- str_replace_all(content, "[\r\n\t\t]", "")
    # Only the first matched node is used.
    content.tmp2 <- strsplit(str_trim(content.tmp), "\\s+")[[1]]
    # The first six tokens are the column headers; the rest fill the rows.
    tmp <- as.data.frame(matrix(content.tmp2[-(1:6)], ncol = 6, byrow = TRUE))
    colnames(tmp) <- content.tmp2[1:6]
    tmp
  })

  do.call(rbind, monthly.tables)
}

# Download all twelve months of 2016 for "taibei".
get_tianqi_table(year.from = 2016, year.to = 2016, city = "taibei")


# 23/44
# Install quantmod only when it is not already available, so that re-running
# the script does not reinstall it every time.
if (!requireNamespace("quantmod", quietly = TRUE)) {
  install.packages("quantmod")
}
library(quantmod)
getSymbols("AAPL", src = "yahoo")

# Data is loaded silently without user assignment by default.
# The assigned variable name is that of the respective Symbols value.
head(AAPL, 3)

# Open: opening price of the day; High: highest price of the day;
# Low: lowest price of the day; Close: closing price of the day;
# Volume: trading volume; Adjust: adjusted price.
# Open-High-Low-Close is abbreviated OHLC.
# or
# getSymbols() returns the name of the object it created, so get() can fetch
# that object under a name of your choice.
myApple <- get(getSymbols("AAPL", src = "yahoo"))
head(myApple, 3)

class(AAPL)

# Convert to a plain data frame.
AAPL.df <- as.data.frame(AAPL)
class(AAPL.df)


# 24/44
# Chart of the whole AAPL series with the default settings.
chartSeries(AAPL)
# Same chart restricted to September through December 2016.
chartSeries(AAPL, subset = "2016-09::2016-12")
# Same period drawn as a line on the white theme.
chartSeries(
  AAPL,
  type = "line",
  theme = "white",
  subset = "2016-09::2016-12"
)


# 25/44
# Window bounds. Both comparisons below are strict, so observations dated
# exactly on start.date or end.date are not selected.
start.date <- as.Date("2012-02-01")
end.date <- as.Date("2012-02-28")
APPL.201202 <- AAPL[start.date < index(AAPL) & end.date > index(AAPL)]
head(APPL.201202, n = 3)

tail(APPL.201202, n = 3)

# Request only April 2016 directly from the data source.
AAPL.201604 <- get(
  getSymbols(
    "AAPL",
    src = "yahoo",
    from = as.Date("2016-04-01"),
    to = as.Date("2016-04-30")
  )
)
# note: "AAPL" is changed (the call above assigns AAPL again before get() reads it)
index(AAPL.201604)

# multiple stocks
getSymbols(c("VZ", "AAPL", "MMM", "IBM"))

head(MMM, n = 3)



# 26/44
# Show the current working directory; the files below are written there.
getwd()

# Save data (RData format)
# saveSymbols(file.path=".") # save all Symbols in the current directory
saveSymbols(c("AAPL", "MMM"), file.path = ".")
list.files()

# Remove the Symbols data from this R session; removeSymbols() with no
# argument removes all of them.
removeSymbols("AAPL")
# Load the data stored on this computer (RData format)
load(file = "AAPL.RData")

# Save stock data obtained with getSymbols as a csv file
write.zoo(IBM, file = "IBM.csv", sep = ",", qmethod = "double") # or
# Read the stock csv file into R as an xts object
# (assumes IBM.csv already exists in the working directory)
removeSymbols("IBM") # > showSymbols()
getSymbols.csv("IBM", env = .GlobalEnv)

head(IBM, n = 3)

# Save as a csv file by way of a data.frame
write.csv(as.data.frame(IBM), file = "IBM_2.csv")
getSymbols.csv("IBM_2", env = .GlobalEnv)


# 27/44
# Get investment data from the internet
symbols <- stockSymbols()

# Show the first six rows.
head(symbols, n = 6L)


# 28/44
# With auto.assign = FALSE the data is returned instead of being assigned to
# a variable named after the symbol.
TSMC <- getSymbols("2330.TW", auto.assign = FALSE)
head(TSMC)

# Must use auto.assign=TRUE for multiple Symbols requests
getSymbols(c("2330.TW", "2303.TW", "2337.TW"))

# An unquoted 2330.TW is not a valid R name: the parser reads "2330." as a
# number and then fails on "TW". That is a syntax error, which would stop the
# whole file from being parsed, so the line is kept commented out.
# head(2330.TW) # error

head("2330.TW") # wrong: this is only the character string "2330.TW"

head('2330.TW') # wrong: single quotes (the ' key next to Enter) also make a string

head(`2330.TW`) # correct: backticks (the ` key above Tab) quote the object name

TSMC <- get("2330.TW") # or  TSMC <- `2330.TW`
UMC <- get("2303.TW")
MXIC <- get("2337.TW")
head(UMC) # View(UMC)


# getYahooData is deprecated and will be removed in a future release.
# 30/44
# Daily 2012 data for 2330.TW. NOTE(review): the file itself records that
# getYahooData is deprecated and will be removed in a future release --
# confirm it is still available in the installed version.
TSMC.2012.daily <- getYahooData("2330.TW", start = 20120101, end = 20121231)
head(TSMC.2012.daily)

# Monthly data for the same period. FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
TSMC.2012.monthly <- getYahooData("2330.TW", start = 20120101, end = 20121231,
                                  freq = "monthly", adjust = FALSE)
head(TSMC.2012.monthly)

# https://finance.yahoo.com/world-indices
# e.g., S&P 500 (^GSPC), Dow Jones Industrial Average (^DJI)
# Fetch historical data for the Taiwan weighted index, TSEC weighted index (^TWII)
getSymbols("^TWII", src = "yahoo")

# The data is then used under the name TWII (without the caret).
head(TWII)
chartSeries(TWII)