1. Install DataX
Download datax.tar.gz from https://datax-opensource.oss-cn-hangzhou.aliyuncs.com/202210/datax.tar.gz and extract it to a directory of your choice, e.g. /opt/conf/datax-20230301 in my setup. Extraction creates a datax directory under the current directory, containing subdirectories such as bin, conf, and job.
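If you prefer to script the download and extraction, a minimal sketch from the shell (assuming wget and tar are available and the target directory is writable; adjust the paths to your own layout):

# create the installation directory and fetch the release tarball
mkdir -p /opt/conf/datax-20230301
cd /opt/conf/datax-20230301
wget https://datax-opensource.oss-cn-hangzhou.aliyuncs.com/202210/datax.tar.gz
# extract; this creates the datax/ subdirectory described above
tar -zxvf datax.tar.gz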
Then, in the DataX installation directory, run the following command to grant execute permissions:
sudo chmod -R 755 ./*
2. Test that DataX is installed correctly
/opt/conf/datax-20230301/datax/bin/datax.py /opt/conf/datax-20230301/datax/job/job.json
Run the command above and check whether it starts correctly; a successful run finishes with DataX's job summary (start/end time, throughput, and error-record count).
If it does not run correctly, the likely cause is that /opt/conf/datax-20230301/datax/bin/datax.py is not UTF-8 encoded and needs to be re-encoded; replacing it with a UTF-8 encoded copy of datax.py restores normal operation.
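If you want to diagnose and fix the encoding yourself instead, one approach from the shell is sketched below. It assumes the broken file was saved in GBK, which is only a guess; check what file reports first and substitute the actual source encoding in the iconv call:

cd /opt/conf/datax-20230301/datax/bin
# inspect what encoding the system detects
file datax.py
# re-encode to UTF-8 (replace GBK with the encoding actually detected)
iconv -f GBK -t UTF-8 datax.py > datax.py.utf8 && mv datax.py.utf8 datax.py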
3. Write the job configuration file
In the job folder under the DataX installation directory, create a new configuration file:
vim job_air_data_source_mysql_hdfs.json
Then copy the JSON content below into the file you just opened and save it.
{
    "job": {
        "setting": {
            "speed": {
                "channel": 3
            },
            "errorLimit": {
                "record": 0,
                "percentage": 0.02
            }
        },
        "content": [
            {
                "reader": {
                    "name": "mysqlreader",
                    "parameter": {
                        "username": "root",
                        "password": "root",
                        "column": ["*"],
                        "splitPk": "id",
                        "connection": [
                            {
                                "table": [
                                    "air_data_source"
                                ],
                                "jdbcUrl": [
                                    "jdbc:mysql://master:3306/air_data"
                                ]
                            }
                        ]
                    }
                },
                "writer": {
                    "name": "hdfswriter",
                    "parameter": {
                        "defaultFS": "hdfs://master:9820",
                        "fileType": "TEXT",
                        "path": "/user/hive/warehouse/air_data.db/air_data_source",
                        "fileName": "air_data_source_202302",
                        "column": [
                            {"name": "id", "type": "STRING"},
                            {"name": "airlinelogo", "type": "STRING"},
                            {"name": "airlineshortcompany", "type": "STRING"},
                            {"name": "arractcross", "type": "STRING"},
                            {"name": "arracttime", "type": "STRING"},
                            {"name": "arrairport", "type": "STRING"},
                            {"name": "arrcode", "type": "STRING"},
                            {"name": "arrontimerate", "type": "STRING"},
                            {"name": "arrplancross", "type": "STRING"},
                            {"name": "arrplantime", "type": "STRING"},
                            {"name": "arrterminal", "type": "STRING"},
                            {"name": "checkintable", "type": "STRING"},
                            {"name": "checkintablewidth", "type": "STRING"},
                            {"name": "depactcross", "type": "STRING"},
                            {"name": "depacttime", "type": "STRING"},
                            {"name": "depairport", "type": "STRING"},
                            {"name": "depcode", "type": "STRING"},
                            {"name": "depplancross", "type": "STRING"},
                            {"name": "depplantime", "type": "STRING"},
                            {"name": "depterminal", "type": "STRING"},
                            {"name": "flightno", "type": "STRING"},
                            {"name": "flightstate", "type": "STRING"},
                            {"name": "localdate", "type": "STRING"},
                            {"name": "mainflightno", "type": "STRING"},
                            {"name": "shareflag", "type": "STRING"},
                            {"name": "statecolor", "type": "STRING"}
                        ],
                        "writeMode": "truncate",
                        "fieldDelimiter": "\u0001",
                        "compress": "GZIP"
                    }
                }
            }
        ]
    }
}
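Before running the job, it is worth a quick sanity check that the file is valid JSON. One way, assuming Python is on the PATH (DataX itself already requires it):

python -m json.tool /opt/conf/datax-20230301/datax/job/job_air_data_source_mysql_hdfs.json

Note also that the fieldDelimiter "\u0001" matches Hive's default field delimiter for LazySimpleSerDe, which is why the DDL in the next section does not need an explicit FIELDS TERMINATED BY clause.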
4. Create the Hive database and table
create database air_data;
use air_data;
CREATE TABLE `air_data_source`(
  `id` int COMMENT 'primary key',
  `airlinelogo` string COMMENT 'airline logo',
  `airlineshortcompany` string COMMENT 'airline short name',
  `arractcross` string,
  `arracttime` string COMMENT 'actual arrival time',
  `arrairport` string,
  `arrcode` string,
  `arrontimerate` string COMMENT 'arrival on-time rate',
  `arrplancross` string,
  `arrplantime` string COMMENT 'planned arrival time',
  `arrterminal` string,
  `checkintable` string,
  `checkintablewidth` string,
  `depactcross` string,
  `depacttime` string COMMENT 'actual departure time',
  `depairport` string COMMENT 'departure airport name',
  `depcode` string COMMENT 'departure airport code',
  `depplancross` string,
  `depplantime` string COMMENT 'planned departure time',
  `depterminal` string,
  `flightno` string COMMENT 'flight number',
  `flightstate` string COMMENT 'flight status',
  `localdate` string,
  `mainflightno` string,
  `shareflag` string,
  `statecolor` string)
COMMENT 'raw flight data table'
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
;
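To confirm the table was created with the expected layout before loading data into it, a quick check from the shell (assuming the hive CLI is on the PATH; beeline works the same way):

hive -e "DESCRIBE FORMATTED air_data.air_data_source;"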
After running the statements above, you can move on to the data extraction.
5. Run the job
Execute the following command:
/opt/conf/datax-20230301/datax/bin/datax.py /opt/conf/datax-20230301/datax/job/job_air_data_source_mysql_hdfs.json
This starts the data synchronization job; when it completes, DataX prints its run summary.
To check whether the data files are now on HDFS, run the following command and inspect the output:
hadoop fs -ls hdfs://master:9820/user/hive/warehouse/air_data.db/air_data_source
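Finally, it is worth confirming that Hive can actually read what DataX wrote, delimiter and GZIP compression included; a simple check, again assuming the hive CLI:

hive -e "SELECT COUNT(*) FROM air_data.air_data_source;"

The count should match the number of rows in the MySQL table air_data_source.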
With that, the DataX job that synchronizes MySQL data to HDFS is fully configured.