大數據 ODS&DWD&DIM-SQL分享 需求 思路一:等差數列 斷2天、3天,嵌套太多 1.1 開窗,按照 id 分組,同時按照 dt 排序,求 Rank -- linux 中空格不能用 tab 鍵 select id,dt,rank() over(partition by id order b ...
大數據 ODS&DWD&DIM-SQL分享 需求
思路一:等差數列
斷2天、3天,嵌套太多
1.1 開窗,按照 id 分組,同時按照 dt 排序,求 Rank
-- linux 中空格不能用 tab 鍵
select id,dt,rank() over(partition by id order by dt) rk from tx;
1.2 將每行日期減去RK值,如果之前是連續的日期,則相減之後為相同日期
z: 等差
(x1+z)-(y1+z)=x1-y1
select id,dt,date_sub(dt,rk) flg
from (select id,dt,rank() over(partition by id order by dt) rk from tx) t1;
斷一天的數據,flag 變成了連續
1.3 計算絕對連續的天數
select id,flag,count(*) days
from (
select id,dt,date_sub(dt,rk) flg
from (select id,dt,rank() over(partition by id order by dt) rk from tx) t1;
)t2 group by id,flag;
1.4 再計算連續問題
select id,flag,days,rank() over(partition by id order by flag) newFlag
from t3;
1.5 將 flag 減去 newflag
select id,days,date_sub(flag,newFlag) flag
from t4;t5
1.6 計算每個用戶連續登錄的天數,斷一天也算
select id,sum(days)+count(1) days
from t5
group by id,flag;[t6]
1.7 計算最大連續天數
select id,max(days)
from t6
group by id;
準後再-1
思路二
2.1 將上一行數據下移
--下移預設值,一般給 1970-01-01,上移預設值一般 9999-01-01
select id,dt,lag(dt,1,'1970-01-01') over(partition by id order by dt) lagDt
from tx; t1
2.2 將當前行日期減去下移的日期
select id,dt,datediff(dt,lagDt) dtDiff
from t1; t2
執行
select id,dt,datediff(dt,lagDt) dtDiff
from (
select id,dt,lag(dt,1,'1970-01-01') over(partition by id order by dt) lagDt
from tx) t1;
每碰到一個 >2 的就分組 + 1
2.3 分組
select id,dt,sum(if(dtDiff>2,1,0)) over(partition by id order by dt) flag
from t2; t3
select id,dt,sum(if(dtDiff>2,1,0)) over(partition by id order by dt) flag
from (
select id,dt,datediff(dt,lagDt) dtDiff
from (
select id,dt,lag(dt,1,'1970-01-01') over(partition by id order by dt) lagDt
from tx) t1
) t2;
select id,flag,datediff(max(dt),min(dt))+1
from t3
group by id,flag;
帶入執行
--斷3天把2改成3,斷4天把2改成4
select id,flag,datediff(max(dt),min(dt))+1
from (
select id,dt,sum(if(dtDiff>2,1,0)) over(partition by id order by dt) flag
from (
select id,dt,datediff(dt,lagDt) dtDiff
from (
select id,dt,lag(dt,1,'1970-01-01') over(partition by id order by dt) lagDt
from tx) t1
) t2
)t3
group by id,flag;
2.3 求分組後的最大值
HiveOnSpark: 有個BUG, datediff over 子查詢 => null point
解決方案:
- 換MR引擎
- 將時間欄位由 String 類型改成 Date 類型
https://www.bilibili.com/video/BV1Ju411o7f8/?p=69