ASA的美國總統競選 在這個大選之年,美國統計協會(ASA)將學生競賽和總統選舉放在一起,將學生預測誰是2016年總統大選的贏家準確的百分比作為比賽點。詳情見: http://thisisstatistics.org/electionprediction2016/ 獲取數據 互聯網上有很多公開的民調 ...
ASA的美國總統競選
在這個大選之年,美國統計協會(ASA)將學生競賽和總統選舉放在一起,將學生預測誰是2016年總統大選的贏家準確的百分比作為比賽點。詳情見:
http://thisisstatistics.org/electionprediction2016/
獲取數據
互聯網上有很多公開的民調數據。可以下麵的網站獲取總統大選的相關數據:
http://projects.fivethirtyeight.com/2016-election-forecast/national-polls/
其他較好的數據源是:
http://www.realclearpolitics.com/epolls/latest_polls/
http://elections.huffingtonpost.com/pollster/2016-general-election-trump-vs-clinton
http://www.gallup.com/products/170987/gallup-analytics.aspx)
值得註意的是:數據是每天更新的,所以你在看本文的時候很可能數據變化而得到不同的結果。
因為原始的數據是JSON文件,R拉取下來將其作為了lists中的一個list(列表)。
原文的Github地址:https://github.com/hardin47/prediction2016/blob/master/predblog.Rmd
##載入需要的包 require(XML) require(dplyr) require(tidyr) require(readr) require(mosaic) require(RCurl) require(ggplot2) require(lubridate) require(RJSONIO) ##數據拉取 url = "http://projects.fivethirtyeight.com/2016-election-forecast/national-polls/" doc <- htmlParse(url, useInternalNodes = TRUE) #爬取網頁內容 sc = xpathSApply(doc, "//script[contains(., 'race.model')]", function(x) c(xmlValue(x), xmlAttrs(x)[["href"]])) jsobj = gsub(".*race.stateData = (.*);race.pathPrefix.*", "\\1", sc) data = fromJSON(jsobj) allpolls <- data$polls #unlisting the whole thing indx <- sapply(allpolls, length) pollsdf <- as.data.frame(do.call(rbind, lapply(allpolls, 'length<-', max(indx)))) ##數據清洗 #unlisting the weights pollswt <- as.data.frame(t(as.data.frame(do.call(cbind, lapply(pollsdf$weight, data.frame, stringsAsFactors=FALSE))))) names(pollswt) <- c("wtpolls", "wtplus", "wtnow") row.names(pollswt) <- NULL pollsdf <- cbind(pollsdf, pollswt) #unlisting the voting indxv <- sapply(pollsdf$votingAnswers, length) pollsvot <- as.data.frame(do.call(rbind, lapply(pollsdf$votingAnswers, 'length<-', max(indxv)))) pollsvot1 <- rbind(as.data.frame(do.call(rbind, lapply(pollsvot$V1, data.frame, stringsAsFactors=FALSE)))) pollsvot2 <- rbind(as.data.frame(do.call(rbind, lapply(pollsvot$V2, data.frame, stringsAsFactors=FALSE)))) pollsvot1 <- cbind(polltype = rownames(pollsvot1), pollsvot1, polltypeA = gsub('[0-9]+', '', rownames(pollsvot1)), polltype1 = extract_numeric(rownames(pollsvot1))) pollsvot1$polltype1 <- ifelse(is.na(pollsvot1$polltype1), 1, pollsvot1$polltype1 + 1) pollsvot2 <- cbind(polltype = rownames(pollsvot2), pollsvot2, polltypeA = gsub('[0-9]+', '', rownames(pollsvot2)), polltype1 = extract_numeric(rownames(pollsvot2))) pollsvot2$polltype1 <- ifelse(is.na(pollsvot2$polltype1), 1, pollsvot2$polltype1 + 1) pollsdf <- pollsdf %>% mutate(population = unlist(population), sampleSize = as.numeric(unlist(sampleSize)), pollster = unlist(pollster), startDate = ymd(unlist(startDate)), endDate = ymd(unlist(endDate)), pollsterRating = unlist(pollsterRating)) %>% select(population, sampleSize, pollster, startDate, endDate, pollsterRating, wtpolls, wtplus, wtnow) allpolldata <- cbind(rbind(pollsdf[rep(seq_len(nrow(pollsdf)), each=3),], pollsdf[rep(seq_len(nrow(pollsdf)), each=3),]), rbind(pollsvot1, pollsvot2)) allpolldata <- allpolldata %>% arrange(polltype1, choice)
查看所有的選擇數據:allolldata