hive入門到精通 hive部署 啟動Hadoop # 啟動hadoop start-all.sh # 檢查hadoop進程 jps # 檢查各埠 netstat -aplnt | grep java 檢查MySQL是否啟動成功 ps -aux | grep mysql netstat -apln ...
目錄
hive入門到精通
hive部署
啟動Hadoop
# 啟動hadoop
start-all.sh
# 檢查hadoop進程
jps
# 檢查各埠
netstat -aplnt | grep java
檢查MySQL是否啟動成功
ps -aux | grep mysql
netstat -aplnt | grep 3306
安裝hive
# 將軟體上傳到 /opt/soft 目錄
# 解壓hive
tar -zxvf apache-hive-3.1.3-bin.tar.gz
# 目錄改名
mv apache-hive-3.1.3-bin hive3
# 進入配置文件目錄
cd /opt/soft/hive3/conf
# 複製配置文件
cp hive-env.sh.template hive-env.sh
cp hive-default.xml.template hive-site.xml
# 編輯環境配置文件
vim hive-env.sh
# 編輯配置文件
vim hive-site.xml
hive-env.sh
# hadoop 安裝路徑
export HADOOP_HOME=/opt/soft/hadoop3/
# hive 配置文件路徑
export HIVE_CONF_DIR=/opt/soft/hive3/conf/
hive-site.xml
需要修改的位置提煉如下:
<configuration>
  <!-- Hive metastore metadata is stored in MySQL.
       NOTE: '&' is illegal inside an XML value and must be escaped as '&amp;';
       the MySQL Connector/J property is spelled 'serverTimezone'. -->
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://spark03:3306/hive?useUnicode=true&amp;createDatabaseIfNotExist=true&amp;characterEncoding=UTF8&amp;useSSL=false&amp;serverTimezone=Asia/Shanghai</value>
  </property>
  <!-- JDBC driver class for MySQL Connector/J 8.x -->
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.cj.jdbc.Driver</value>
  </property>
  <!-- MySQL user name and password -->
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>Lihaozhe!!@@1122</value>
  </property>
  <!-- HDFS location of the Hive warehouse -->
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/user/hive/warehouse</value>
  </property>
  <!-- HDFS scratch directory (stray extra </property> removed here) -->
  <property>
    <name>hive.exec.scratchdir</name>
    <value>/user/hive/tmp</value>
  </property>
  <property>
    <name>hive.exec.local.scratchdir</name>
    <value>/user/hive/local</value>
    <description>Local scratch space for Hive jobs</description>
  </property>
  <property>
    <name>hive.downloaded.resources.dir</name>
    <value>/user/hive/resources</value>
    <description>Temporary local directory for added resources in the remote file system.</description>
  </property>
  <!-- query log directory -->
  <property>
    <name>hive.querylog.location</name>
    <value>/user/hive/log</value>
  </property>
  <!-- metastore service endpoint -->
  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://spark01:9083</value>
  </property>
  <!-- HiveServer2 thrift port for remote clients (beeline/JDBC) -->
  <property>
    <name>hive.server2.thrift.port</name>
    <value>10000</value>
  </property>
  <property>
    <name>hive.server2.thrift.bind.host</name>
    <value>0.0.0.0</value>
  </property>
  <property>
    <name>hive.server2.webui.host</name>
    <value>0.0.0.0</value>
  </property>
  <!-- HiveServer2 web UI port -->
  <property>
    <name>hive.server2.webui.port</name>
    <value>10002</value>
  </property>
  <property>
    <name>hive.server2.long.polling.timeout</name>
    <value>5000</value>
  </property>
  <property>
    <name>hive.server2.enable.doAs</name>
    <value>true</value>
  </property>
  <!--
  <property>
    <name>datanucleus.autoCreateSchema</name>
    <value>false</value>
  </property>
  <property>
    <name>datanucleus.fixedDatastore</name>
    <value>true</value>
  </property>
  -->
  <property>
    <name>hive.execution.engine</name>
    <value>mr</value>
  </property>
  <property>
    <name>hive.metastore.schema.verification</name>
    <value>false</value>
    <description>
      Enforce metastore schema version consistency.
      True: Verify that version information stored in is compatible with one from Hive jars. Also disable automatic
      schema migration attempt. Users are required to manually migrate schema after Hive upgrade which ensures
      proper metastore schema migration. (Default)
      False: Warn if the version information stored in metastore doesn't match with one from in Hive jars.
    </description>
  </property>
</configuration>
註意:上面配置文件中的路徑在 vi 編輯器下 全局替換
:%s@\${system:java.io.tmpdir}@/tmp/hive-logp@g
不要使用圖形化編輯器 不然每次保存後第3215行都會產生一個不可見的特殊字元(code 0x8) 如果產生 刪除該字元即可 具體報錯信息 後面有單獨的描述
上傳 MySQL 連接驅動 jar 包到 hive 安裝目錄的lib目錄下:
/opt/soft/hive3/lib
jar 包有兩個 分別為:
- mysql-connector-java-8.0.33.jar
- protobuf-java-3.22.2.jar
刪除原有的 protobuf-java-2.5.0.jar 文件
guava版本衝突
刪除 hive/lib目錄中的 guava-19.0.jar
拷貝hadoop/share/hadoop/common/lib目錄中的 guava-27.0-jre.jar 到 hive/lib 目錄
rm -f /opt/soft/hive3/lib/guava-19.0.jar
cp -v /opt/soft/hadoop3/share/hadoop/common/lib/guava-27.0-jre.jar /opt/soft/hive3/lib
配置環境變數
vim /etc/profile
export HIVE_HOME=/opt/soft/hive3
export PATH=$PATH:$HIVE_HOME/bin
source /etc/profile
初始化hive的元資料庫
註意:初始化元資料庫之前 保證 hadoop 和 mysql 正常啟動
schematool -initSchema -dbType mysql
Exception in thread "main" java.lang.RuntimeException: com.ctc.wstx.exc.WstxParsingException: Illegal character entity: expansion character (code 0x8
at [row,col,system-id]: [3215,96,"file:/usr/local/hive/conf/hive-site.xml"]
at org.apache.hadoop.conf.Configuration.loadResource(Configuration.java:3051)
...
at org.apache.hadoop.util.RunJar.main(RunJar.java:236)
Caused by: com.ctc.wstx.exc.WstxParsingException: Illegal character entity: expansion character (code 0x8
at [row,col,system-id]: [3215,96,"file:/usr/local/hive/conf/hive-site.xml"]
at com.ctc.wstx.sr.StreamScanner.constructWfcException(StreamScanner.java:621)
...
at org.apache.hadoop.conf.Configuration.loadResource(Configuration.java:3034)
... 17 more
報錯原因:
hive-site.xml配置文件中,3215行(見報錯記錄第二行)有特殊字元
解決辦法:
進入hive-site.xml文件,跳轉到對應行,刪除裡面的  特殊字元即可
Caused by: java.net.URISyntaxException: Relative path in absolute URI: ${system:java.io.tmpdir%7D/$%7Bsystem:user.name%7D
at java.net.URI.checkPath(URI.java:1822)
at java.net.URI.<init>(URI.java:745)
at org.apache.hadoop.fs.Path.initialize(Path.java:260)
解決方案:將hive-site.xml配置文件的
hive.querylog.location
hive.exec.local.scratchdir
hive.downloaded.resources.dir
三個值(原始為$標識的相對路徑)寫成絕對值
# 全局替換
:%s@\${system:java.io.tmpdir}@/tmp/hive-logp@g
遠程模式
# 啟動服務端
hive --service metastore &
hive --service hiveserver2 &
# 後臺運行
nohup hive --service metastore > /dev/null 2>&1 &
nohup hive --service hiveserver2 > /dev/null 2>&1 &
hiveserver2 start
nohup hiveserver2 >/dev/null 2>&1 &
# 客戶端連接
hive
beeline -u jdbc:hive2://spark01:10000 -n root
beeline jdbc:hive2://spark01:10000> show databases;
體驗
use default;
create table person (
id int,
phonenum bigint,
salary double,
name string
);
create table ps (
id int,
phonenum bigint,
salary double,
name string
);
show tables;
insert into person values (1001,13966668888,9999.99,"張三");
songsong,bingbing_lili,xiao song:18_xiaoxiao song:19
longlong,pingping_liuliu,xiao long:8_xiaoxiao long:9
drop table person;
create table person (
name string,
friends array<string>,
childrens map<string,int>
)
row format delimited fields terminated by ','
collection items terminated by '_'
map keys terminated by ':'
lines terminated by '\n';
load data local inpath '/root/person.txt' into table person;
-- Recreate the demo table (name, amount) and load the sample file.
drop table data;
create table data (
    name string,
    amount int
)
row format delimited fields terminated by ','
lines terminated by '\n';
load data local inpath '/root/data.txt' into table data;
-- total row count
select count(*) from data;
-- row count per name
select count(*) from data group by name;
-- fix: the table has no column `t`; the numeric column is `amount`
select name, max(amount) from data group by name;
select name, max(amount) from data group by name order by max(amount);
編程
DDL
操作資料庫
創建資料庫
-- 創建資料庫不指定路徑
create database db_hive01;
-- 創建資料庫指定 hdfs 路徑
create database db_hive02 location '/db_hive02';
-- 創建資料庫附加 dbproperties
create database db_hive03 with dbproperties ('create-date'='2023-04-17','create_author'='lihaozhe');
查詢資料庫
-- 查看所有資料庫
show databases;
-- 模糊查看所有資料庫
-- * 代表所有
-- | 代表或
show databases like 'db_hive*';
-- 查看資料庫信息
desc database db_hive03;
-- 查看資料庫詳盡信息
describe database db_hive03;
-- 查看資料庫更詳盡信息
describe database extended db_hive03;
修改資料庫
-- 修改 dbproperties
alter database db_hive03 SET dbproperties ('crate_data'='2023-04-18');
-- 修改location
alter database db_hive02 SET location '/db_hive002';
-- 修改 owner user
alter database database_name set owner user lhz;
刪除資料庫
-- 刪除空資料庫
drop database db_hive02 restrict;
-- 刪除非空資料庫
drop database db_hive03 cascade;
切換資料庫
use db_hive01;
DML
操作數據表
普通表
臨時表 temporary
外部表 external
-- 利用 select 語句查詢結果 創建一張表
create table as select
-- 復刻一張已經存在的表結構 但是 不包含數據
create table like
基本數據類型
數據類型 | 說明 | 定義 |
---|---|---|
tinyint | 1 byte 有符號整型 | |
smallint | 2 byte 有符號整型 | |
int | 4 byte 有符號整型 | |
bigint | 8 byte 有符號整型 | |
float | 4 byte 單精度浮點數 | |
double | 8 byte 雙精度浮點數 | |
decimal | 十進位精準數據類型 | |
varchar | 字元序列 需要指定最大長度 範圍[1~65535] | |
string | 字元串 無需指定最大長度 | |
timestamp | 時間 | |
binary | 二進位數據 | |
boolean | true false | |
array | 一組相同數據類型的集合 | array |
map | 一組相同數據類型的鍵值對 | map<string,int> |
struct | 由多個屬性組成,每個屬性都有自己的屬性名和數據類型 | struct&lt;id:int,name:string&gt; |
內部表
簡單表
-- Simple managed (internal) table.
-- fix: `dicimal` is not a valid Hive type; use decimal(8,2) so the
-- inserted salary 9999.99 is stored exactly (plain decimal has scale 0).
create table person (
    id int,
    phonenum bigint,
    salary decimal(8,2),
    name string
);
show tables;
insert into person values (1001,13966668888,9999.99,"張三");
簡單數據類型
create table data (
name string,
amount int
)
row format delimited fields terminated by ','
lines terminated by '\n'
location '/user/hive/warehouse/lihaozhe.db/data';
# 上傳文件到Hive表指定的路徑
hdfs dfs -put /root/data.csv /user/hive/warehouse/lihaozhe.db/data
複雜數據類型
vim /root/person.txt
songsong,bingbing_lili,xiao song:18_xiaoxiao song:19
longlong,pingping_liuliu,xiao long:8_xiaoxiao long:9
drop table person;
create table person (
name string,
friends array<string>,
childrens map<string,int>
)
row format delimited fields terminated by ','
collection items terminated by '_'
map keys terminated by ':'
lines terminated by '\n';
load data local inpath '/root/person.txt' into table person;
json數據類型
json函數
get_json_object
json_tuple
json serde載入數據
--serialization 序列化 --deserialization 反序列化
{"name":"user01","amount":"100"}
{
"name":"lhz",
"friends":["lize","lanlan","manman"],
"students":{
"xiaohui":15000,"huixiaoxiao":18000
},
"address":{
"province":"jilin",
"city":"liaoyuan",
"district":"liaoyuan"
}
}
-- 案例一
create table video (info string);
load data local inpath '/root/video.log' into table video;
select * from video limit 10;
select count(*) from video;
select
get_json_object(info,'$.id') as id,
get_json_object(info,'$.nickname') as nickname,
get_json_object(info,'$.gold') as gold
from video limit 5;
select
json_tuple(info,'id','nickname',"gold") as (id,nickname,gold)
from video limit 5;
案例二
create table video(
id string ,
uid string,
nickname string,
gold int,
watchnumpv int,
watchnumuv int,
hots int,
nofollower int,
looktime int,
smlook int ,
follower int ,
gifter int ,
length int ,
area string ,
rating varchar(1),
exp int ,
type string
)
row format serde 'org.apache.hive.hcatalog.data.JsonSerDe';
load data local inpath '/root/video.log' into table video;
--把json數據,按類給格式化,
STORED AS TEXTFILE;--文本文檔的形式
--導入數據
load data local inpath '/root/video.log' into table video;
創建表的loaction'' 作用是 集群裡邊有對應的數據,則直接將數據載入到表中
load data local inpath '' into table 是使用hive將數據從本地載入到已經建好的表中
load data inpath '' into table 是使用hive將數據從集群裡邊載入到已經建好的表中
外部表
create external table data (
name string,
amount int
)
row format delimited fields terminated by ','
lines terminated by '\n'
location '/user/hive/warehouse/lihaozhe.db/data';
內部表與外部表轉換
-- managed table -> external table
-- fix: Hive only honors the property when the value is the uppercase
-- literal 'TRUE'/'FALSE'; lowercase 'true' is silently ignored.
alter table tblName set tblproperties('EXTERNAL'='TRUE');
-- external table -> managed table
alter table tblName set tblproperties('EXTERNAL'='FALSE');
查看表
-- 查看表
show tables;
-- 查看某資料庫下的某張表
show tables in lihaozhe;
-- 查看表
show tables;
-- 模糊查看數據表
-- * 代表所有
-- | 代表或
show tables like 'per*';
-- 查看基本表信息
describe person;
-- 查看基本表詳細信息
describe extended person;
-- 查看基本表詳細信息並格式化展示
describe formatted person;
修改表
-- 修改表名稱
alter table person rename to tb_user;
-- 添加欄位 向末尾追加
alter table tb_user add columns (gender tinyint);
-- 修改欄位名稱及類型
alter table tb_user change gender age smallint;
-- 刪除欄位
刪除表
drop table tb_user
清除表
truncate table video;
DQL
準備數據
-- 部門表 dept.csv
10,行政部,1700
20,財務部,1800
30,教學部,1900
40,銷售部,1700
hdfs dfs -mkdir -p /quiz01/dept
# fix: a space was missing between the local file and the HDFS target dir
hdfs dfs -put /root/dept.csv /quiz01/dept
create external table dept(
dept_id int comment '部門id',
dept_name string comment '部門名稱',
location_code int comment '部門位置'
)
comment '部門表'
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/quiz01/dept';
load data local inpath '/root/dept.csv' into table dept;
員工表
7369,張三,研發,800.00,30
7499,李四,財務,1600.00,20
7521,王五,行政,1250.00,10
7566,趙六,銷售,2975.00,40
7654,侯七,研發,1250.00,30
7698,馬八,研發,2850.00,30
7782,金九,行政,2450.0,30
7788,銀十,行政,3000.00,10
7839,小芳,銷售,5000.00,40
7844,小明,銷售,1500.00,40
7876,小李,行政,1100.00,10
7900,小元,講師,950.00,30
7902,小海,行政,3000.00,10
7934,小紅明,講師,1300.00,30
hdfs dfs -mkdir -p /quiz01/emp
hdfs dfs -put /root/emp.csv /quiz01/emp
create external table emp
(
emp_id int comment '員工ID',
emp_name string comment '員工姓名',
emp_job string comment '員工崗位',
emp_salary decimal(8, 2) comment '員工薪資',
dept_id int comment '員工隸屬部門ID'
)
comment '員工表'
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/quiz01/emp' ;
load data local inpath '/root/emp.csv' into table emp;
居民表 person.csv
hdfs dfs -mkdir -p /quiz02/person
hdfs dfs -put /root/person.csv /quiz02/person
CREATE external TABLE `person` (
`id` int COMMENT '主鍵',
`id_card` varchar(18) COMMENT '身份證號碼',
`mobile` varchar(11) COMMENT '中國手機號',
`real_name` varchar(15) COMMENT '身份證姓名',
`uuid` varchar(32) COMMENT '系統唯一身份標識符'
)
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/quiz02/person';
load data local inpath '/root/person.csv' into table person;
地區表 region.csv
hdfs dfs -mkdir -p /quiz02/region
hdfs dfs -put /root/region.csv /quiz02/region
CREATE external TABLE `region` (
`parent_code` int COMMENT '當前地區的上一級地區代碼',
`region_code` int COMMENT '地區代碼',
`region_name` varchar(10) COMMENT '地區名稱'
)
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/quiz02/region';
load data local inpath '/root/region.csv' into table region;
單表查詢
-- 查詢所有
select * from dept;
-- 按照指定欄位查詢
select dept_name from dept;
-- 列別名
select dept_name as name from dept;
-- limit 分頁查詢
select * from emp limit 5,5
-- where 按條件查詢
select * from emp where dept_id = 10;
-- 關係運算符
-- = != > >= < <=
-- in
select * from emp where dept_id in (20,40);
-- not in
select * from emp where dept_id not in (20,40);
-- like
select * from emp where emp_name like '小%';
-- not like
select * from emp where emp_name not like '小%';
-- 邏輯運算符
-- and
select * from emp where dept_id = 30 and emp_salary > 1000;
-- between and
select * from emp where dept_id = 30 and emp_salary > 1000 and emp_salary < 2000;
select * from emp where dept_id = 30 and emp_salary between 1000 and 2000;
--not between and
select * from emp where dept_id = 30 and emp_salary not between 1000 and 2000;
-- or
select * from emp where dept_id = 10 or dept_id = 40;
-- not !
select * from emp where dept_id != 10;
select * from emp where not dept_id = 10;
-- 聚合函數
-- count(*) count(1) count(column_name)
select count(*) from emp;
select count(*) as total from emp;
-- max
select max(emp_salary) from emp;
-- min
select min(emp_salary) from emp;
-- sum
select sum(emp_salary) from emp;
-- avg
select avg(emp_salary) from emp;
-- group by 分組查詢
select dept_id, avg(emp_salary) as avg_salary from emp group by dept_id;
-- having
select dept_id, avg(emp_salary) as avg_salary from emp group by dept_id having avg_salary > 2000;
-- where having
select dept_id, avg(emp_salary) as avg_salary from emp where dept_id != 10 group by dept_id having avg_salary > 2000;
-- order by 全局排序
select * from emp order by dept_id desc ,emp_salary desc;
select dept_id, max(emp_salary) from emp group by dept_id;
select dept_id, max(emp_salary) as max_salary from emp group by dept_id order by max_salary desc;
-- sort by (每個reduce)內部排序
select * from emp sort by dept_id desc
-- 查看 reduce 數量
set mapreduce.job.reduces;
-- 設置 reduce 數量 僅在當前連接有效 連接斷開失效
set mapreduce.job.reduces=2;
select * from emp sort by dept_id desc;
-- 將查詢結果寫入到文件
insert overwrite local directory '/root/sort-result' select * from emp sort by dept_id desc;
-- distribute by 分區 類似於 mapreduce 中的 partition 自定義分區
set mapreduce.job.reduces=2;
insert overwrite local directory '/root/distribute-result' select * from emp distribute by dept_id sort by emp_salary desc;
-- distribute by 分區規則 根據欄位的hash值 與 reduce 的數量 進行相除 餘數 相同的在到一個分區
-- hive 要求 distribute by 語句執行 在 sort by 語句之前
-- 執行結束之後 將 mapreduce.job.reduces 設置為 -1 不然 會影響 分區 分桶 load
-- cluster by 只能升序 不能降序 cluster by = sort by + distribute by
select * from emp cluster by dept_id;
多表查詢
-- Cartesian product (cross join) — shown as the anti-pattern
select * from dept, emp;
-- avoid the Cartesian product by adding a join condition
select * from dept, emp where dept.dept_id = emp.dept_id;
-- equi-join (inner join); fix: the join condition belongs in ON, not WHERE
select * from dept join emp on dept.dept_id = emp.dept_id;
-- left outer join
-- fix: `emp_jpb` was a typo for `emp_job`; using WHERE instead of ON on an
-- outer join silently turns it back into an inner join
select d.dept_id, d.dept_name, d.location_code, e.emp_id, e.emp_name, e.emp_job, e.emp_salary
from dept d left join emp e on d.dept_id = e.dept_id;
-- right outer join
select d.dept_id, d.dept_name, d.location_code, e.emp_id, e.emp_name, e.emp_job, e.emp_salary
from dept d right join emp e on d.dept_id = e.dept_id;
-- full outer join
select d.dept_id, d.dept_name, d.location_code, e.emp_id, e.emp_name, e.emp_job, e.emp_salary
from dept d full join emp e on d.dept_id = e.dept_id;
-- union: vertical concatenation with deduplication
select * from emp where dept_id = 10 or dept_id = 40;
select * from emp where dept_id in(10,40);
select * from emp where dept_id = 10 union select * from emp where dept_id = 40;
-- union all: vertical concatenation, duplicates kept
select * from emp where dept_id = 10 or dept_id = 40;
select * from emp where dept_id in(10,40);
select * from emp where dept_id = 10 union all select * from emp where dept_id = 40;
-- self-join starting point: look up one region
select * from region where region_code='220422';
--
函數
# enable local-mode execution for small jobs
set hive.exec.mode.local.auto=true;
-- fix: the property is mapred.job.tracker (original had `mapperd`)
set mapred.job.tracker=local;
-- 算術運算符
-- + — * / % & | ~
-- 數值運算
-- round 四捨五入
select round (3.3) as num;
-- ceil 向上取整
select ceil(3.3) as num;
-- floor 向下取整
select floor(3.3) as num;
-- 字元串
-- 截取 substr(column_name,start_index,length)
select substr(id_card,3,3) from person;
-- substring(column_name,start_index,length)
select substring(id_card,3,3) from person;
-- spilt 字元串切割
select split('2023-04-19','-');
-- nvl 判空 替換 null 值
select nvl("lhz",1);
select nvl(null,1);
-- replace 字元串替換
SELECT REPLACE('aaa.mysql.com','a','w');
-- concat 字元串拼接
select concat('slogan','-','tlbyxzcx');
-- concat_ws 以指定分隔符拼接字元串集合
select concat_ws('-',array('2022','04','19'));
-- get_json_object 解析 json 字元串
select get_json_object('[{"name":"lhz","age":41}]','$.name') as name;
select get_json_object('[
{"name":"lhz","age":41},
{"name":"lz","age":14}
]','$.[0].name')
-- json_tuple
select json_tuple('{"name":"lhz","age":41}','name','age') as (name,age);
-- 日期函數
-- unix 時間戳
--(1970.01.01 00:00:00 GMT UTC 時間)
-- unix_timestamp 返回 bigint 無時區
select unix_timestamp();-- 1681951622 時間秒數
select unix_timestamp('1983-11-22 20:30:00','yyyy-MM-dd HH:mm:ss'); -- 438381000
-- from_unixtime
select from_unixtime(438381000); -- 1983-11-22 20:30:00
-- current_date
select current_date();
-- current_timestamp
select current_timestamp();
-- year month day hours minute second
select year('1983-01-23');
-- datediff 兩個日期相差天數(結束日期減去開始日期)
select datediff('1983-01-23','1982-12-31')
-- date_add 日期增加幾天
select date_add('1995-01-01',15);
-- date_sub 日期減少幾天
select date_sub('1995-01-01',15);
-- date_format 日期格式化
select date_format ('1983-11-22 20:30:00','yyyy年MM月dd日 HH時mm分ss秒');
-- 讀取身份證獲取出生日期 輸出格式為 yyyy-MM-dd
-- 1、字元串截取 2、日期格式化
select substr(id_card,7,8) from person limit 3;
select unix_timestamp(substr(id_card,7,8),'yyyyMMdd') from person limit 3;
select from_unixtime(unix_timestamp(substr(id_card,7,8),'yyyyMMdd')) from person limit 3;
select substr(from_unixtime(unix_timestamp(substr(id_card,7,8),'yyyyMMdd')),1,10) from person limit 3;
-- 流程式控制制語句
-- case when
-- >90 a,80~90 b, 70~80 c, 60~70 的,<60 e
select
stu_id,course_id,
case
when score >= 90 then 'A'
when score >= 80 then 'B'
when score >= 70 then 'c'
when score >= 60 then 'D'
else'E'
end as grade
From score;
-- if 三目運算 if(條件表達式,條件為真的返回結果,條件為假的返回結果)
select if(1=2,'托尼','瑪麗') as `髮型師`
-- 結合字元串函數 時間函數 流程式控制制函數 計算身份證信息
-- 根據身份證號 判斷性別 身份證號 第十七位 奇數為男性 偶數為女性
select id_card, if(mod(substr(id_card,17,1),2) = 1,'精神小伙兒','扒蒜老妹兒') gender from person;
-- 根據身份證號 找出所有男性信息
select *, mod(substr(id_card,17,1),2) gender from person where mod(substr(id_card,17,1),2) = 1;
-- 根據身份證號 計算男性人數和女性人數
select
if(mod(substr(id_card,17,1),2) = 1,'精神小伙兒','扒蒜老妹兒') gender ,
count(*) gender_count
from person group by mod(substr(id_card,17,1),2) limit 10;
-- 根據身份證號 計算生日排序
select
date_format(from_unixtime( unix_timestamp(substr(id_card,7,8),'yyyyMMdd')),'yyyy-MM-dd') as `birthday`
from person
order by unix_timestamp(`birthday`,'yyyy-MM-dd') desc
-- 根據身份證號 計算年齡
-- 1、當前月份-出生月份 > 0 說明 已經過完生日 及 使用 當前年份 - 出生年份 = 年齡
-- 2、當前月份-出生月份 < 0 說明 未過生日 及 使用 當前年份 - 出生年份 -1 = 年齡
-- 3、當前月份-出生月份 = 0
-- 3.1、當前日-出生日 > 0 說明 已經過完生日 及 使用 當前年份 - 出生年份 = 年齡
-- 3.2、當前日-出生日 < 0 說明 未過生日 及 使用 當前年份 - 出生年份 -1 = 年齡
-- 3.3、當前日-出生日 = 0 說明 生日視作過完了 及 使用 當前年份 - 出生年份 = 年齡
select if(month(`current_date`()) - month(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) > 0,
year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))),
if(month(`current_date`()) - month(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) < 0,
year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) - 1,
if(day(`current_date`()) - day(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) > 0,
year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))),
if(day(`current_date`()) - day(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) < 0,
year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) - 1,
year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd')))
)
)
)
) as `age`
from person;
-- 集合函數
-- size 集合中元素的數量
select size(array(0,1,2,3,4,5));
-- array 聲明一個集合 數組
select array(0,1,2,3,4,5) as nums;
-- array_contains 判斷array中是否包含某元素 返回值是 布爾值 true false
select array_contains(array(0,1,2,3,4,5),3) as num;
-- sort_array 排序 目前只能升序
select sort_array(array(0,1,2,3,4,5));
-- struct 聲明結構體屬性名稱
select struct('real_name','lhz','age','41');
-- {"col1":"real_name","col2":"lhz","col3":"age","col4":"41"}
-- named_struct 聲明結構體屬性和值
select named_struct('real_name','lhz','age','41');
-- {"real_name":"lhz","age":"41"}
-- 集合函數
-- map
select map('xz',1000,'js',800);
-- {"xz":1000,"js":800}
-- map_keys 返回 map 中所有的 key
select map_keys(map('xz',1000,'js',800));
-- map_values 返回 map 中所有的 value
select map_values(map('xz',1000,'js',800));
-- if 三目運算
select if(條件表達式,條件為真表達式,條件為假表達式)
練習
數據:
學生表
講師表
課程表
分數表
學生表 student.csv
hdfs dfs -mkdir -p /quiz03/student
hdfs dfs -put /root/student.csv /quiz03/student
load data local inpath '/root/student.csv' into table student;
課程表 course.csv
hdfs dfs -mkdir -p /quiz03/course
hdfs dfs -put /root/course.csv /quiz03/course
load data local inpath '/root/course.csv' into table course;
分數表 score.csv
hdfs dfs -mkdir -p /quiz03/score
hdfs dfs -put /root/score.csv /quiz03/score
-- fix: this section loads the score table; the original re-loaded course.csv into course
load data local inpath '/root/score.csv' into table score;
-- 學生表
create external table student (
stu_id string comment '學生ID',
stu_name string comment '學生姓名',
birthday string comment '出生年月',
gender string comment '學生性別'
)
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/quiz03/student';
--教師表
create external table teacher (
tea_id string comment '課程ID',
tea_name string comment '教師名稱'
)
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/quiz03/teacher';
-- course table
-- fix: column names were misspelled `coures_id`/`coures_name`; every later
-- query (joins, case studies) references `course_id`/`course_name`
create external table course (
    course_id string comment '課程ID',
    course_name string comment '課程名稱',
    tea_id string comment '講師ID'
)
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/quiz03/course';
-- score table (same column-name fix)
create external table score (
    stu_id string comment '學生ID',
    course_id string comment '課程ID',
    score string comment '成績'
)
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/quiz03/score';
綜合練習
-- 查詢所有學生信息
select * from student;
-- 查詢周姓學生信息
select * from student where stu_name like '周%';
-- 查詢周姓學生數量
select count(*) as zhou_count from student where stu_name like '周%';
-- scores above 85 for student ID '004'
-- fix: stu_id is declared string, so compare against a quoted literal
-- (unquoted 004 forces a numeric cast and misses zero-padded ids)
select * from score where stu_id = '004' and score > 85;
-- (the original repeated the same query twice; kept for parity)
select * from score where stu_id = '004' and score > 85;
-- scores for student ID '004' in descending order
-- fix: the original queried 01 here, contradicting its own comment
select * from score where stu_id = '004' order by score desc;
-- 查詢 數學成績不及格學生及其對應的成績 安裝學生ID排序
select stu.stu_id, stu.stu_name,s.score
from student stu join course c join score s
on stu.stu_id = s.stu_id and c.course_id = s.course_id
and c.course_name = '數學' and s.score < 60
order by stu.stu_id;
-- 查詢男女生人數
select gender,count(*) as gender_count from student group by gender;
-- 查詢編號為 02 的課程平均成績
select avg(score) from score where course_id = 02;
-- 查詢每科課程平均成績
select avg(score) from score group by course_id;
-- 查詢參加考試學生人數
select count(distinct stu_id) as stu_count from score;
-- 查詢每科有多少學生參加考試
select course_id,count(*) as stu_count from score group by course_id;
-- 查詢未參加考試的學生信息
select stu_id,stu_name from student where stu_id not in (
select distinct stu.stu_id from student stu left join course c left join score s
on stu.stu_id = s.stu_id and c.course_id = s.course_id
order by stu.stu_id
)
-- 查詢平均成績及格(60分)的學生的平均成績
select stu_id, avg(score) avg_score
from score
group by stu_id
having avg_score >= 60;
-- 查詢選修至少 4 門 以上課程學生的學號
select stu_id,count(course_id) course_count from score
group by stu_id
having course_count >= 4;
-- 查詢姓氏相同學生名單 並且同姓人數大於 2 的姓氏
select first_name ,count(*) first_name_count from (
select stu_id,stu_name,substr(stu_name,1,1) as first_name
from student
) ts
group by ts.first_name
having first_name_count > 1;
-- 查詢每門功課的學生的平均成績 按照平均成績升序 平均成績相同按照課程編號降序
select course_id, avg(score) avg_score
from score
group by course_id
order by avg_score,course_id desc;
-- subjects with at least 15 examinees
-- fix: the requirement says >= 15 but the original filtered with > 15
select course_id, count(*) as stu_count from score group by course_id having stu_count >= 15;
-- 查詢學生總成績並按照總成績降序排序
select stu_id, sum(score) sum_score
from score
group by stu_id
order by sum_score desc;
-- 按照指定格式顯示 stu_id 語文 數學 英語 選課數 平均成績
select
s.stu_id,
sum(`if`(c.course_name='語文',score,0)) as `語文`,
sum(`if`(c.course_name='數學',score,0)) as `數學`,
sum(`if`(c.course_name='英語',score,0)) as `英語`,
count(s.course_id) as `選課數`,
avg(s.score) as `平均成績`
from course c left join score s
on c.course_id = s.course_id
group by s.stu_id
order by `平均成績` desc;
-- 查詢一共參加了三門功課且其中一門為語文的學生id 和 姓名
select s.stu_id,stu_name from
(select t1.stu_id ,count(t1.course_id) course_count from
(select stu_id,course_id from score
where stu_id in ( select stu_id from score where course_id = "01")
) t1 group by t1.stu_id having course_count >=3
) t2 join student s on t2.stu_id = s.stu_id;
-- 分解
-- 查詢該學生的姓名
select s.stu_id,stu_name from
-- 成績表中學習科目數量 >=3 科的學生
(select t1.stu_id ,count(t1.course_id) course_count from
-- 報名了語文的學生還報名了那些學科
(select stu_id,course_id from score
where stu_id in (
-- 查詢報名了語文的學生ID
select stu_id from score where course_id = "01"
)
) t1 group by t1.stu_id having course_count >=3
) t2 join student s on t2.stu_id = s.stu_id;
-- 查詢兩門以上的課程不及格學生的學號及其平均成績
-- 1、先按照學生分組 過濾出成績低於60的數量 大於1
-- 2、計算所有學生的平均成績
-- 3、兩個子查詢相互join
select t1.stu_id,t2.avg_score from
(select stu_id, sum(if(score < 60, 1, 0)) as result from score group by stu_id having result > 1) t1
left join
(select stu_id,avg(score) as avg_score from score group by stu_id) t2 on t1.stu_id =t2.stu_id;
-- 查詢所有學生的學號、姓名、選課數、總成績
select
stu.stu_id,stu.stu_name,count(s.course_id) count_course ,nvl(sum(s.score),0) total_score
from student stu left join score s on stu.stu_id = s.stu_id
group by stu.stu_id, stu.stu_name order by stu.stu_id;
-- 平均成績大於 85 的所有學生的學號、姓名、平均成績
select
stu.stu_id,stu.stu_name ,nvl(avg(s.score),0) as `avg_score`
from student stu left join score s on stu.stu_id = s.stu_id
group by stu.stu_id, stu.stu_name having nvl(avg(s.score),0) > 85 order by stu.stu_id
-- 查詢學生的選課情況:學號,姓名,課程號,課程名稱
select student.stu_id,student.stu_name,c.course_id,c.course_name from student
right join score s on student.stu_id = s.stu_id
left join course c on s.course_id = c.course_id
-- 查詢學生的沒有選課情況:學號,姓名
select stu_id,stu_name from
(
select student.stu_id,student.stu_name, s.course_id from student
left join score s on student.stu_id = s.stu_id
left join course c on s.course_id = c.course_id
) t where course_id is null
-- 查詢出每門課程的及格人數和不及格人數
select c.course_id,course_name,pass,fail
from course c join
(
select
course_id,sum(if(score >= 60,1,0)) as `pass`, sum(if(score < 60,1,0)) as `fail`
from score group by course_id
) t on c.course_id = t.course_id
-- 查詢課程編號為03且課程成績在80分以上的學生的學號和姓名及課程信息
select t1.stu_id,s.stu_name,t1.course_id,c.course_name,t1.score from
(select * from score where course_id = '03' and score > 80) t1
left join student s on s.stu_id = t1.stu_id
left join course c on t1.course_id = c.course_id
-- 查詢語文成績低於平均分數的學生是誰,教師是誰
select t3.stu_id,t3.stu_name,t3.`avg_score`,t.tea_name from
(select t2.stu_id,t2.`avg_score`,s.stu_name,t2.course_id,c.tea_id from
(select t1.stu_id,t1.course_id,t1.`avg_score` from
(select stu_id,s.course_id, avg(score) as `avg_score` from score s right join
(select course_id from course where course_name = '語文') t1 on t1.course_id = s.course_id
group by stu_id,s.course_id) t1
where t1.`avg_score` < (select avg(score) as `avg_score` from score s right join (select course_id from course where course_name = '語文') t1 on t1.course_id = s.course_id)
) t2 left join student s on t2.stu_id = s.stu_id
left join course c on t2.course_id = c.course_id
)t3 left join teacher t on t3.tea_id = t.tea_id;
-- 查詢所有學生總成績和平均成績,
-- 且他們的總成績低於平均成績的有多少個人,
-- 高於平均成績的有多少人,
-- 低於平均成績的男生和女生分別有多少人,
-- 且他們的任課老師是誰。
-- 統計各科成績各分數段人數:課程編號,課程名稱,[100-85],[85-70],[70-60],[0-60]及所占百分比
-- 方法一
select course_id,
concat(round((sum(`if`(score >= 85, 1, 0)) / count(*)) * 100, 2), '%') as `a`,
concat(round((sum(`if`(score < 85, `if`(score >= 70, 1, 0), 0)) / count(*)) * 100, 2), '%') as `b`,
concat(round((sum(`if`(score < 70, `if`(score >= 60, 1, 0), 0)) / count(*)) * 100, 2), '%') as `c`,
concat(round((sum(`if`(score < 60, 1, 0)) / count(*)) * 100, 2), '%') as `d`
from score group by course_id;
-- method 2: score bands via BETWEEN (inclusive on both ends)
-- fix: band c overlapped band b (60-74 vs 70-84); the disjoint bands are
-- a: >=85, b: 70-84, c: 60-69, d: <60
select course_id,
       concat(round((sum(`if`(score >= 85, 1, 0)) / count(*)) * 100, 2), '%') as `a`,
       concat(round((sum(`if`(score between 70 and 84, 1, 0)) / count(*)) * 100, 2), '%') as `b`,
       concat(round((sum(`if`(score between 60 and 69, 1, 0)) / count(*)) * 100, 2), '%') as `c`,
       concat(round((sum(`if`(score < 60, 1, 0)) / count(*)) * 100, 2), '%') as `d`
from score group by course_id;
-- 查詢各科成績最高分、最低分和平均分,以如下形式顯示:
-- 課程ID,課程name,最高分,最低分,平均分,及格率,中等率,優良率,優秀率
-- 及格為>=60,中等為:70-80,優良為:80-90,優秀為:>=90
select c.course_id as `課程ID`,
c.course_name as `課程name`,
max(score) as `最高分`,
min(score) as `最低分`,
round(avg(score), 2) as `平均分`,
concat(round(sum(`if`(score >= 60, 1, 0)) / count(*) * 100, 2), '%') as `及格率`,
concat(round(sum(if(score between 70 and 79, 1, 0)) / count(*) * 100, 2), '%') as `中等率`,
concat(round(sum(if(score between 80 and 89, 1, 0)) / count(*) * 100, 2), '%') as `優良率`,
concat(round(sum(`if`(score >= 90, 1, 0)) / count(*) * 100, 2), '%') as `優秀率`
from course c left join score s on c.course_id = s.course_id
group by c.course_id, c.course_name;
-- 查詢每門課程的教師學生有誰,男生和女生的比例是多少,
select t1.course_id,t1.gender,concat(round((t1.count_gender / t2.count_course_student) * 100,2),'%') as `proportion` from
(
select c.course_id, stu.gender,count(stu.gender) as `count_gender`
from course c left join score s on c.course_id = s.course_id left join student stu on s.stu_id = stu.stu_id
group by c.course_id, stu.gender
) t1
join
(
select c.course_id, count(*) as `count_course_student`
from course c left join score s on c.course_id = s.course_id left join student stu on s.stu_id = stu.stu_id
group by c.course_id
) t2 on t1.course_id = t2.course_id
join score s on t1.course_id = s.course_id
-- 且他們的每門學科的成績是男生比較優一些還是女生比較優一些,並且每門課程的最高分是誰。
select s.course_id,max(s.score) as `max_score`,min(s.score) as `min_score` from course join score s on course.course_id = s.course_id group by s.course_id
-- 課程編號為"01"且課程分數小於60,按分數降序排列的學生信息
select s.stu_id, stu.stu_name, stu.birthday, stu.gender,s.score
from score s join student stu on s.stu_id = stu.stu_id
where s.score < 60 order by s.score desc
-- Students whose every score is >= 70: name, course name, score, ascending.
-- Fixed: missing statement terminator.
select stu.stu_name, c.course_name, s2.score
from student stu
         join (
    -- is_ok = 0 means the student has no score below 70
    select s.stu_id, sum(`if`(s.score >= 70, 0, 1)) as `is_ok`
    from score s
    group by s.stu_id
    having is_ok = 0
) t1 on stu.stu_id = t1.stu_id
         left join score s2 on stu.stu_id = s2.stu_id
         left join course c on s2.course_id = c.course_id
order by s2.score;
-- Students who scored the same in different courses:
-- id, the list of those courses, and the (deduplicated) score set.
-- Fixed: non-standard '==' replaced with '=' and the statement terminated.
select s1.stu_id,
       collect_list(s1.course_id) as course_id,
       collect_set(s1.score)      as score
from score s1
         join score s2 on s1.stu_id = s2.stu_id
    and s1.course_id != s2.course_id
    and s1.score = s2.score
group by s1.stu_id;
-- Students scoring below the course average in "語文", and that course's teacher.
-- Rewritten: the original line was left unfinished (no ON clauses, a truncated
-- "and s." predicate, and a typo "course_nam").
-- NOTE(review): assumes course carries a tea_id foreign key to teacher — confirm schema.
select stu.stu_name, t.tea_name
from score s
         join student stu on s.stu_id = stu.stu_id
         join course c on s.course_id = c.course_id
         join teacher t on c.tea_id = t.tea_id
         join (
    -- average score per course, joined back so each row can compare against it
    select course_id, avg(score) as avg_score
    from score
    group by course_id
) a on s.course_id = a.course_id
where c.course_name = "語文"
  and s.score < a.avg_score;
-- Use string / date / control-flow functions to derive facts from the ID-card number.
-- Gender from the ID card: the 17th digit is odd for male, even for female.
select id_card, if(mod(substr(id_card,17,1),2) = 1,'精神小伙兒','扒蒜老妹兒') gender from person;
-- All male rows, selected with the same 17th-digit parity rule.
select *, mod(substr(id_card,17,1),2) gender from person where mod(substr(id_card,17,1),2) = 1;
-- Head-count per gender, derived from the 17th ID-card digit.
-- Fixed: Hive's strict group-by rejects a SELECT item that is merely a function
-- of the grouping key, so the GROUP BY now repeats the full gender expression.
select if(mod(substr(id_card, 17, 1), 2) = 1, '精神小伙兒', '扒蒜老妹兒') gender,
       count(*)                                                 gender_count
from person
group by if(mod(substr(id_card, 17, 1), 2) = 1, '精神小伙兒', '扒蒜老妹兒')
limit 10;
-- Birthdays (yyyy-MM-dd) extracted from the ID card, newest first.
-- Fixed: the original wrapped the column alias in unix_timestamp() inside
-- ORDER BY, which Hive cannot resolve, and the statement was unterminated.
-- Sorting the 'yyyy-MM-dd' string directly is chronologically equivalent.
select date_format(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd')), 'yyyy-MM-dd') as `birthday`
from person
order by `birthday` desc;
-- Age from the ID-card birth date (digits 7-14, yyyyMMdd):
-- 1. current month - birth month > 0 -> birthday passed     -> age = current year - birth year
-- 2. current month - birth month < 0 -> birthday not passed -> age = current year - birth year - 1
-- 3. current month - birth month = 0:
--    3.1 current day - birth day > 0 -> birthday passed       -> age = current year - birth year
--    3.2 current day - birth day < 0 -> birthday not passed   -> age = current year - birth year - 1
--    3.3 current day - birth day = 0 -> counted as passed     -> age = current year - birth year
select if(month(`current_date`()) - month(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) > 0,
year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))),
if(month(`current_date`()) - month(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) < 0,
year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) - 1,
if(day(`current_date`()) - day(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) > 0,
year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))),
if(day(`current_date`()) - day(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) < 0,
year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd'))) - 1,
year(`current_date`()) - year(from_unixtime(unix_timestamp(substr(id_card, 7, 8), 'yyyyMMdd')))
)
)
)
) as `age`
from person;
高級聚合函數
分組排序取TopN
-- row_number() over (): assigns a gap-free sequential number to every row.
-- The grouping/ordering inside over() is applied after where / group by / order by.
-- partition by splits the rows into independent windows.
select *,row_number() over () as `num` from score;
-- 查詢各科成績前三名的學生
SELECT a.stu_id,a.course_id,a.score
FROM score a
LEFT JOIN score b ON a.course_id = b.course_id
AND a.score <= b.score
GROUP BY a.stu_id,a.course_id,a.score
HAVING COUNT( b.stu_id ) <= 3
ORDER BY a.course_id,a.score DESC;
-- Top 3 per course via a correlated subquery: keep a row when fewer than 3
-- rows in the same course have a strictly higher score.
-- NOTE(review): Hive does not generally support a correlated subquery used like
-- this in WHERE (MySQL-style); on Hive prefer the row_number() window form.
SELECT S1.course_id,s1.stu_id,s1.score FROM score s1
WHERE (
SELECT COUNT(*) FROM score s2
WHERE s2.course_id=s1.course_id AND s2.score > s1.score
) < 3 ORDER BY s1.course_id,s1.score DESC;
-- Top 3 scores per course: number the rows of each course window by descending
-- score, then keep only the first three of every window.
select ranked.course_id,
       ranked.stu_id,
       ranked.score,
       ranked.num
from (
         select course_id,
                stu_id,
                score,
                row_number() over (partition by course_id order by score desc) as `num`
         from score
     ) ranked
where ranked.num <= 3;
-- rank() over (): ranking with gaps — tied rows share a number and the next
-- distinct value skips ahead (1, 1, 3, ...).
select * from
(
select course_id,stu_id,score,
rank() over (partition by course_id order by score desc ) as `ranking`
from score
) t;
-- dense_rank() over (): ranking without gaps — tied rows share a number and the
-- next distinct value follows immediately (1, 1, 2, ...).
select * from
(
select course_id,stu_id,score,
dense_rank() over (partition by course_id order by score desc ) as `ranking`
from score
) t;
行列轉換
-- Rows to a single column (aggregation into a collection).
-- collect_list: keeps order and duplicates, returns an array.
select collect_list(emp_job) as `job` from employee;
-- collect_set: removes duplicates, returns an array.
-- Fixed: the original mistakenly called collect_list here.
select collect_set(emp_job) as `job` from employee;
-- concat_ws('separator', array): joins an array into a single string.
select concat_ws(',',collect_set(emp_job)) as `job` from emp;
-- split(string, 'separator'): splits a string back into an array.
-- Fixed: split() takes two arguments; the original omitted the separator.
select split(concat_ws(',', collect_set(emp_job)), ',') as `job` from emp;
-- 列轉行
UDF,即用戶定義函數(user-defined function),作用於單行數據,並且產生一個數據行作為輸出。
Hive中大多數函數都屬於這一類,比如數學函數和字元串函數。UDF函數的輸入與輸出值是1:1關係。
UDTF,即用戶定義表生成函數(user-defined table-generating function),-- 又稱炸裂函數
作用於單行數據,並且產生多個數據行。UDTF函數的輸入與輸出值是1:n的關係。UDAF,用戶定義聚集函數(user-defined aggregate function),作用於多行數據,產生一個輸出數據行。
Hive中像COUNT、MAX、MIN和SUM這樣的函數就是聚集函數。UDAF函數的輸入與輸出值是n:1的關係。
雇員表 employee.csv
hdfs dfs -mkdir -p /quiz04/employee
hdfs dfs -put /root/employee.csv /quiz04/employee
-- External table over /quiz04/employee: CSV rows where collection items are
-- separated by '_' and map entries by ':' (friends array, children map).
create external table employee(
name string comment '姓名',
sex string comment '性別',
birthday string comment '出生年月',
hiredate string comment '入職日期',
job string comment '崗位',
salary int comment '薪資',
bonus int comment '獎金',
friends array<string> comment '朋友',
children map<string,int> comment '孩子'
)
row format delimited fields terminated by ','
collection items terminated by '_'
map keys terminated by ':'
lines terminated by '\n'
stored as textfile
location '/quiz04/employee';
-- NOTE(review): the file was already uploaded into the table location with
-- hdfs dfs -put above, so this load adds a second copy of the same rows.
load data local inpath '/root/employee.csv' into table employee;
UDTF
-- explode: one output row per array element / map entry.
select explode(array('java','python','scala','go')) as course;
select explode(map('name','李昊哲','gender','1')) as (key,value);
-- posexplode: like explode, but also emits each element's position.
select posexplode(array('java','python','scala','go')) as (pos,course);
-- inline: expands an array of structs into rows with one column per field.
select inline(array(named_struct('id',1,'name','李昊哲','gender','1'),
named_struct('id',2,'name','李哲','gender','0'),
named_struct('id',3,'name','李大寶','gender','1'))) as (id,name,gender);
-- lateral view: joins each source row with the rows produced by a UDTF.
select * from employee lateral view explode(friends) t as friend;
select e.name,e.friends,t1.friend from employee e lateral view explode(friends) t1 as `friend`;
select * from employee e lateral view explode(children) t1 as `children_name`,`children_friend_count`;
-- Fixed: the original wrote t1.nvl(t2....) — nvl is a function (no table
-- prefix), and only the t1 lateral view exists in this statement.
select e.name,e.children,t1.children_name,nvl(t1.children_friend_count,0) from employee e
lateral view explode(children) t1 as `children_name`,`children_friend_count`;
-- Two lateral views multiply: every friend is paired with every child entry.
select e.name,e.friends,e.children,t1.friend,t2.children_name,nvl(t2.children_friend_count,0) from employee e
lateral view explode(friends) t1 as `friend`
lateral view explode(children) t2 as `children_name`,`children_friend_count`;
-- lateral view outer
電影表 movie.txt
hdfs dfs -mkdir -p /quiz04/movie
hdfs dfs -put /root/movie.txt /quiz04/movie
-- External table over /quiz04/movie: '-' separates the name from the
-- comma-separated category string.
create external table movie(
name string comment '電影名稱',
category string comment '電影分類'
)
row format delimited fields terminated by '-'
lines terminated by '\n'
stored as textfile
location '/quiz04/movie';
-- NOTE(review): combined with the earlier hdfs dfs -put this loads the data twice.
load data local inpath '/root/movie.txt' into table movie;
UDTF 案例
-- Number of movies per category: split the comma-separated category string,
-- explode it into one row per category value, then count movies per value.
select cate,
       count(name) as `quantity`
from movie
         lateral view explode(split(category, ',')) category_view as cate
group by cate;
分組和去重
-- Count distinct jobs: single-stage count(distinct ...).
select count(distinct emp_job) from emp;
-- Equivalent two-stage form: deduplicate with GROUP BY first, then count rows.
select count(*) from (select emp_job from emp group by emp_job) t;
開窗函數
能為每行數據劃分一個視窗,然後對視窗範圍內的數據進行計算,最後將計算結果返回給該行
什麼是開窗函數?
開窗函數(Window Function)是在 SQL 中用於對分組數據執行聚合函數過程的函數。開窗函數可以將結果分成分組集合,並逐個分組進行計算,而不是標準聚合函數計算全部數據。
開窗函數可以按照視窗大小
(範圍)對行數據進行分組,並對每個子組執行聚合計算。它們在 SELECT 和 ORDER BY 子句中使用,且不能單獨使用。在使用開窗函數時需要與 OVER 子句一起使用,以便定義子分組。 下麵是一些常見的開窗函數:
ROW_NUMBER()
:分配連續的整數行號給查詢結果集中的每一行。ROWNUM
:Oracle 中類似 ROW_NUMBER() 函數的行號函數,不過在語法上有所不同。RANK()
:計算等級。相同數據的等級是一樣的,假如有3個人考了同樣的分數,他們的排名應該是併列第一,第四個人的排名則應是第四。DENSE_RANK()
:計算等級,此函數不會像前面的 RANK 函數一樣跳過重覆項,而是把他們放在同一個等級里。LEAD( )
:返回當前行後的指定行數的值。LAG()
: 返回當前行前的指定行數的值。FIRST_VALUE()
:返回視窗或分組的第一行對應的值。LAST_VALUE()
:返回視窗或分組的最後一行對應的值。SUM() OVER()
:計算視窗或分組的總和。AVG() OVER()
:計算視窗或分組的平均值。MIN() OVER()
:計算視窗或分組的最小值MAX() OVER()
:計算視窗或分組的最大值。
Function(arg1,..., argn) OVER ([PARTITION BY <...>] [ORDER BY <....>] [<window_expression>])
-- 其中Function(arg1,..., argn) 可以是下麵分類中的任意一個
-- 聚合函數:比如sum max min avg count等
-- 排序函數:比如row_number rank dense_rank等
-- 分析函數:比如lead lag first_value last_value等
-- OVER [PARTITION BY <...>] 類似於group by 用於指定分組 每個分組你可以把它叫做視窗
-- 如果沒有PARTITION BY 那麼整張表的所有行就是一組
-- [ORDER BY <....>] 用於指定每個分組內的數據排序規則 支持ASC、DESC
-- [<window_expression>] 用於指定每個視窗中 操作的數據範圍 預設是視窗中所有行
hdfs dfs -mkdir /quiz04/order
hdfs dfs -put /root/order.csv /quiz04/order
-- External table over /quiz04/order; `order` is a reserved word, hence backticks.
create external table `order`
(
order_id string comment '訂單id',
user_id string comment '用戶id',
user_name string comment '用戶姓名',
order_date string comment '下單日期',
order_amount int comment '訂單金額'
)
row format delimited fields terminated by ','
lines terminated by '\n'
stored as textfile
location '/quiz04/order';
聚合函數
rows 基於行
range 基於值
函數() over(rows between 3 preceding and current row)