用sparkR，分析上億條訂單數據的腳本。

上周我們這個10人的小團隊開發的推薦拉新系統，日拉新人數已接近4萬人。過去幾個月這個系統從無到有，拉新從日增幾千穩步增長到日增幾萬，同事們幾個月來，每天工作13個小時以上，洗澡時間都沒有，有時就住在公司，回家怕吵到家人，只能睡客廳地板，周日也不能保證休息。大家的全力投入，不懈努力才能有 ...

上周我們這個10人的小團隊開發的推薦拉新系統，日拉新人數已接近4萬人。過去幾個月這個系統從無到有，拉新從日增幾千穩步增長到日增幾萬，同事們幾個月來，每天工作13個小時以上，洗澡時間都沒有，有時就住在公司，回家怕吵到家人，只能睡客廳地板，周日也不能保證休息。大家的全力投入，不懈努力才能有這個結果。非常感慨團隊產生的的化學反應，和驚人的生產效率。產品穩定後，最近全面轉入大數據分析，和機器學習階段，開始做真正的增長黑客實踐。 spark， R， scala都是剛剛開始深入地學習，沒幾天，還好有數據，學的快！，不休息，連做夢都是在做分析數據的工作，日進千里啊。

剛開始用spark-sql的時候，如果做一個複雜的查詢，寫一長串sql，誰都看不懂，拆成小sql，就要保存中間結果，效率低下。用了幾天後，開始切入sparkR和Scala , 發現效率比直接用spark－sql高太多了，代碼可讀性也強太多。此外善用cahe，也可以有效提高效率。

下麵都是乾貨。廢話不多少，只希望幫到你。

工作目標：分析一下新手券分享的拉新效果和人數，需要對最近15日的訂單大概2億多條訂單紀錄，以及300萬左右的領券紀錄，幾十萬筆的返利信息做全庫查詢，這在msql上是不可能完成的任務。對spark＋hive來說，也很耗時，但一個小時內可以搞定。

用R寫了一下查詢腳本，稍後準備改成scala的。兩者都是調用spark api，區別應該只在語法上。

用15個節點的spark跑這個查詢腳本，大概需要半個多小時才能出來結果。代碼是最完整，最準確的文檔，提綱挈領的總結以後得空再總結。

############################statistics.R################################

#領券日期參數, 修改統計日參數
date_parameter <- "2016-07-11"
dayCount_parameter = 1

hiveContext <- sparkRHive.init(sc)
sql(hiveContext, "use honeycomb_bh_db")

#通過hiveSql 獲得想要的並集集合併且緩存下來 sql date_add
##程式執行階段1: 數據準備。。。。。
acquired_users_sql <-"select * from sc_t_acquire_record where sc_t_acquire_record.year=2016 and sc_t_acquire_record.month=07 and to_date(ct_time)='STARTDATE'"
all_order_sql <- "select * from sc_t_order_all_info As a where a.year=2016 and a.month=07 and to_date(a.create_time)>='STARTDATE' and to_date(a.create_time)<=date_add(date('STARTDATE'),14) and product_id=210"
rebate_order_sql <- "select * from sc_t_order_rebate_info As a where a.year=2016 and a.month=07 and to_date(a.create_time)>='STARTDATE' and to_date(a.create_time)<=date_add(date('STARTDATE'),7) and product_id=210"

acquired_users_sql<-sub(pattern='STARTDATE', replacement=date_parameter, acquired_users_sql)
all_order_sql<-gsub(pattern='STARTDATE', replacement=date_parameter, all_order_sql)
rebate_order_sql<-gsub(pattern='STARTDATE', replacement=date_parameter, rebate_order_sql)

#當天領券綁定的用戶集合
acquired_users <-sql(hiveContext,acquired_users_sql)
cache(acquired_users)

#15日內的全訂單集合
all_orders <-sql(hiveContext,all_order_sql)

#7日內返利的訂單集合
rebated_orders <- sql(hiveContext,rebate_order_sql)

#第0日領券後到14日結束前，有打車紀錄的
acquired_users_with_orders<-join(acquired_users,all_orders, acquired_users$presentee_mobile==all_orders$passenger_phone, "left_outer")
acquired_users_with_orders <- filter(acquired_users_with_orders, "passenger_phone is not null")

mobiles_acquired_users <-distinct(select(acquired_users_with_orders, "presentee_mobile"))
#write.json(acquired_users_with_orders, "file:///home/rd/spark/bin/20160711_users_convertion.json")

#第0日領券後～第7日結束前，被返利的領券用戶
orders_rebated_within_8days <- join(acquired_users,rebated_orders, acquired_users$presentee_mobile==rebated_orders$passenger_phone, "left_outer")
orders_rebated_within_8days <- filter(orders_rebated_within_8days, "passenger_phone is not null")

cache(orders_rebated_within_8days)
results <- data.frame("name" = c("frist"), "value" = c(0),stringsAsFactors=FALSE)

##程式執行階段2: 開始利用spark進行集合運算。。。。。

#第0日到第7日結束前，券有效期內打過車的領券用戶訂單數據
rules<- "to_date(a.create_time)>='STARTDATE' and to_date(a.create_time)<=date_add(date('STARTDATE'),7)"
rules<-gsub(pattern='STARTDATE', replacement=date_parameter, rules)
orders_within_8days = filter(acquired_users_with_orders, rules)
mobiles_with_orders_within_8days <- distinct(select(orders_within_8days, "presentee_mobile"))

#第8日到第14日結束前，券過期後，打過車的領券用戶訂單數據
rules<- "to_date(a.create_time)>=date_add(date('STARTDATE'),8) and to_date(a.create_time)<=date_add(date('STARTDATE'),15)"
rules<-gsub(pattern='STARTDATE', replacement=date_parameter, rules)
orders_after_8days = filter(acquired_users_with_orders, rules)
mobiles_with_orders_after_8days <- distinct(select(orders_after_8days, "presentee_mobile"))

#第0日到第7日結束前，被返利信息紀錄的領券用戶
mobiles_user_reabted <-distinct(select(orders_rebated_within_8days, "presentee_mobile"))

#券0～7天有效期內首單後未被返利的用戶
mobiles_my_team_losted <- except(mobiles_with_orders_within_8days, mobiles_user_reabted)

#第8日券有效期過後， 14日內，有成交紀錄被sic統計方法，統計進來的用戶
mobiles_after_7days_countedBySicheng <-except(mobiles_with_orders_after_8days, mobiles_user_reabted)

#券0～7天有效期內首單後未被返利的用戶, 第8日到第14日成單，被sic統計轉化的用戶
mobiles_my_team_losted_countedBySicheng <-intersect(mobiles_my_team_losted, mobiles_with_orders_after_8days)

#第8日券有效期過後， 14日內，思成沒有統計的首單用戶
mobiles_both_losted <- except(mobiles_my_team_losted, mobiles_after_7days_countedBySicheng)

#券0～7天有效期內首單後未被返利，後7天沒打車的用戶
mobile_first_order_withno_coupon_no_futher_order_after_7days <- except(mobiles_my_team_losted, mobiles_with_orders_after_8days)

#7日內沒打車，後7日打車的用戶
mobiles_with_order_invoked_coupon <- except(mobiles_with_orders_after_8days, mobiles_with_orders_within_8days)

#領券後15天里打車的用戶, 由於業務特性，可以重覆領券這個存在重覆統計。
mobiles_converted = acquired_users_with_orders

#程式運行階段：輸出結果。。。
results<-rbind(results, c("領新手券的用戶數量", nrow(distinct(select(acquired_users, "presentee_mobile")))))
results<-rbind(results, c("領新手券後15日轉化的用戶數量", nrow(mobiles_acquired_users)))
results<-rbind(results, c("領新手券7日內打車用券轉化的用戶數量", nrow(mobiles_user_reabted)))
results<-rbind(results, c("新手券有效期過期後7日內打車轉化用戶", nrow(mobiles_after_7days_countedBySicheng)))
results<-rbind(results, c("sic統計方法統計的轉化用戶數", nrow(mobiles_user_reabted)+nrow(mobiles_after_7days_countedBySicheng)))
results<-rbind(results, c("7日內首單未用新手券的人數", nrow(mobiles_my_team_losted)))
results<-rbind(results, c("7日內首單未用新手券, 後7日內沒打車的人數", nrow(mobiles_both_losted)))
results<-rbind(results, c("7日內首單未用新手券, 後7日內有打車的人數", nrow(mobiles_my_team_losted_countedBySicheng)))

results<-rbind(results, c("領新手券後7日內未打車, 後7日又打車的人數", nrow(mobiles_with_order_invoked_coupon)))
results

用sparkR， 分析上億條訂單數據的腳本。

用sparkR，分析上億條訂單數據的腳本。