數據聚合和採樣_ZenDei技術網路在線

基礎聚合常見的聚合函數 count,sum,avg,max,min set hive.map.aggr=true; //mapper端預聚合，提高性能，但消耗較多記憶體註意：只能select group by從句的欄位，否則報錯 select name, gender_age.gender, co... ...
基礎聚合 
常見的聚合函數 count,sum,avg,max,min
set hive.map.aggr=true;      //mapper端預聚合，提高性能，但消耗較多記憶體
註意：只能select group by從句的欄位，否則報錯
select name, gender_age.gender, count(*) as row_cnt  --error!
from employee
group by gender_age.gender;

如何select沒有出現在group by中的欄位？
    1)使用collect_set函數；
select gender_age.gender, collect_set(gender_age.age)[0] as random_age, count(*) as row_cnt
from employee
group by gender_age.gender;

select中使用多個聚合函數
select gender_age.gender, avg(gender_age.age) as avg_age, count(*) as row_cnt
from employee
group by gender_age.gender;

    2)使用分析函數(不需要 group by)
聚合函數和 case when 一起使用
select sum(case when gender_age.gender = "Male" then gender_age.age else 0 end) /
sum(case when gender_age.gender = "Male" then 1 else null end) as man_age_avg    -- 0 == null
from employee;

聚合函數和coalesce, if連用。 case when和if的區別？
coalesce(a1, a2, ...) - Returns the first non-null argument
求age_sum, woman_age_sum
select sum(coalesce(gender_age.age, 0)) as age_sum,
sum(if(gender_age.gender = "Female", gender_age.age, 0)) as woman_age_sum
from employee;

聚合函數不允許嵌套 如：avg(count(*)) error!

聚合函數中使用distinct 
select count(distinct gender_age.gender) as sex_uni_cnt,
count(distinct name) as name_uni_cnt
from employee;
註意：count和distinct一起使用時，只會使用一個reducer，降低性能，解決方法：
select count(*) as sex_uni_cnt
from (
select distinct gender_age.gender from employee
) a;
註意：聚合時若遇到含有null值的列，則忽略該行，如：
--創建含有null行的表
create table t as select * from (
select employee_id - 99 as val1,
employee_id - 98 as val2 from employee_hr
where employee_id <= 101
union all
select null as val1, 2 as val2 from employee_hr
where employee_id = 100
) a;
+---------+---------+--+
| t.val1  | t.val2  |
+---------+---------+--+
| 1       | 2       |
| NULL    | 2       |
| 2       | 3       |
+---------+---------+--+
select sum(val1 + val2) from t;     --第二行會被忽略，解決方法：
select sum(coalesce(val1, 0) + val2) from t;

高級聚合
grouping sets 自定義聚合
內層的grouping sets定義了每個union all中的group by要實現的數據。
select name, work_place[0] as main_place, count(employee_id) as emp_id_cnt
from employee_id
group by name, work_place[0] grouping sets((name, work_place[0]));
<==>
select name, work_place[0] as main_place, count(employee_id) as emp_id_cnt
from employee_id
group by name, work_place[0];

外層的grouping sets定義了union all要實現的數據。
select name, work_place[0] as main_place, count(employee_id) as emp_id_cnt
from employee_id
group by name, work_place[0] grouping sets((name), (work_place[0]));
<==>
select name, null as main_place, count(employee_id) as emp_id_cnt
from employee_id
group by name
union all
select null as name, work_place[0] as main_place, count(employee_id) as employee_id_cnt
from employee_id
group by work_place[0];

內外層grouping by混合使用
select name, work_place[0] as main_place, count(employee_id) as employee_id_cnt
from employee_id
group by name, work_place[0] grouping sets((name, work_place[0]), name, work_place[0], ());

煮魚：grouping sets的內容為表或記錄的別名所指向的列時，可能會出現問題，已修複。如：
select gender_age.gender, gender_age.age,
count(name) as name_uni_cnt
from employee
group by gender_age.gender, gender_age.age grouping sets((gender_age.gender, gender_age.age));

rollup and cube

rollup 創建n + 1級聚合，n為group by中的分組列
group by a, b, c with rollup <==> grouping set((a, b, c), (a, b), (a), ())
cube  創建2^n級聚合
group by a, b, c with cube 

grouping_id 表示行的列是否聚合的位向量的10進位形式??????????
select grouping_id, bin(cast(grouping_id as bigint)) as bit_vector,
name, start_date, count(employee_id) emp_id_cont
from employee_hr
group by start_date, name
with cube 
order by start_date;

having - 對group by的結果進行過濾，避免分組以後再使用子查詢
將員工按年齡分組，統計:人數<=1的年齡組
select gender_age.age 
from employee_id
group by gender_age.age having count(*) <= 1;
<==>
select a.age  
from (
select gender_age.age as age, count(*) as cnt    --內查詢中的列一定要給個別名
from employee_id
group by gender_age.age
) a
where a.cnt <= 1;  --where中不支持UDAF函數，要起個別名

分析函數
分析函數與聚合函數的區別？
聚合函數會分組，然後為每個組生成一個結果；分析函數不對結果集分組
Function(arg1,..., argn) OVER ([PARTITION BY <...>] [ORDER BY <....>] [<window_clause>])
或者 P184
function(arg1,..., argn) over w as w_n
...
window
w as ([PARTITION BY <...>] [ORDER BY <....>] [<window_clause>])

Function(arg1,..., argn):
    標準的聚合函數
    分析函數：不能在視窗從句中使用，還有lead，lag。
        ntile:將排序好的數據集分桶，並且給每行分配一個合適的桶號，適用於將數據等分

        rank:組內元素進行編號
        dense_rank:組內元素進行編號，併排的下一個元素步進1
        row_number:安照排序和分區給元素編號，每組從1開始
        percent_rank:??

        cume_dist:??計算 <= 給定值的行數， 給定值 = 總行數 / 當前行            
    視窗函數：
        lead(value_expr,[offset],[default]):視窗(組)內指定列的行向下前進offset行
        lag(value_expr,[offset],[default]):視窗(組)內指定列的行向上滯後offset行
        first_value:
        last_value:
over + :
    標準的聚合函數
    註意order by只支持一個排序鍵，變通方法：使用rows unbounded preceding視窗從句
    partition by + :基本類型的一個或多個分區列 why?
    partition by + sorted by：任意類型的一個或多個分區列/排序列
    視窗從句：視窗和組的區別？視窗是分區內的某個範圍
        作用：細顆粒度
        類型：
          ⚠️：升序時，star_expr必須小於end_expr,否則整列值為null或報異常
          行類型視窗    ROWS BETWEEN <start_expr> AND <end_expr> 當前行之前或之後的行數
          範圍類型視窗     RANGE BETWEEN <start_expr> AND <end_expr>
              兩視窗的不同點：range僅支持一個排序列，且此列只能為數字或日期。視窗的範圍由排序列決定

        若定義了order by,缺失視窗從句。預設為：range between unbounded preceding and current row 視窗為當前分區的起點到當前行
        都缺失。預設為：rows between unbounded preceding and unbounded following 視窗為當前分區

create table if not exists employee_contract (
name string,
dept_num int,
employee_id int,
salary int,
type string,
start_date date
)
row format delimited
fields terminated by '|'
stored as textfile;
load data local inpath "/home/centos/hiveessential/ch06/employee_contract.txt"
overwrite into table employee_contract;

select name, dept_num, salary,
count(*) over (partition by dept_num) as row_cnt,  --組內求和
sum(salary) over (partition by dept_num order by dept_num) as deptTotal,   --    組內數據累加
sum(salary) over (order by dept_num) as runingTotol1,   --各組之和累加， ？？？？
sum(salary) over (order by dept_num, name rows unbounded preceding) as runingTotol2  --看不懂
from employee_contract
order by dept_num, name;

set hive.strict.checks.large.query=false;
set hive.mapred.mode=nonstrict;

select name, dept_num, salary,
count(*) over (partition by dept_num) as row_cnt,
sum(salary) over(partition by dept_num order by dept_num) as deptTotal,  
sum(salary) over(order by dept_num) as runingTotol1,   
sum(salary) over(order by dept_num, name rows unbounded preceding) as runingTotol2
from employee_contract
order by dept_num, name;
<==>獨立的視窗從句
select name, dept_num, salary,
count(*) over row_cnt,
sum(salary) over deptTotal,  
sum(salary) over runingTotol1,   
sum(salary) over runingTotol2
from employee_contract
order by dept_num, name
window
row_cnt as (partition by dept_num),
overdeptTotal as (partition by dept_num order by dept_num),
runingTotol1 as (order by dept_num),
runingTotol2 as (order by dept_num, name rows unbounded preceding);

//範圍查詢
select name, dept_num, salary, start_year,
max(salary) over (partition by dept_num order by start_year 
range between 2 preceding and current row) as win1
from (
select name, dept_num, salary, year(start_date) as start_year
from employee_contract
) a;

抽樣
1. 隨機取樣??
select name from employee_id
distribute by rand() sort by rand()
limit 5;
2. 分桶表抽樣
SELECT * FROM <BucketTable_Name> 
TABLESAMPLE(BUCKET <specified bucket number to sample> OUT OF <total number of buckets> ON [colname|RAND()]) table_alias;
select name from employee_id_buckets
tablesample(bucket 1 out of 2 on rand()) a;
3. 塊抽樣
SELECT * FROM <Table_Name> 
TABLESAMPLE(N PERCENT|ByteLengthLiteral|N ROWS) s;
按百分比大小抽樣
select name from employee_id_buckets
tablesample (20 percent) a;
按行數抽樣
select name from employee_id
tablesample (5 rows) a;
按位元組大小抽樣
select name from employee_id
tablesample (700B) a;