[Author]: kwu
从关系库导入数据到 Hive-HBase 表:关系库到 HBase 本可以直接用 Sqoop 导入,但这里需要对 rowkey 做特殊处理(逆序)并加入更新时间字段,因此采用如下操作步骤:
1、创建hive与hbase的表
1)创建hbase表
命令行输入 hbase shell 进入hbase的终端:
[mw_shl_code=bash,true]create 'location','cf1'  # table 'location' with a single column family 'cf1'
[/mw_shl_code]
2)创建hive的外表
[mw_shl_code=bash,true]hive -e"
-- Hive external table over the HBase table 'location' created above.
-- The first column binds to the HBase row key (:key); every data column
-- lives in column family cf1 (see hbase.columns.mapping below).
-- IF EXISTS makes the script idempotent: the original bare DROP could
-- fail on a cluster where ignore-nonexistent is disabled.
DROP TABLE IF EXISTS ods.hbase_location;
CREATE EXTERNAL TABLE ods.hbase_location(
    key         string,
    ID          int    COMMENT '唯一ID',
    location1   string COMMENT '国家代号',
    location2   string COMMENT '省份代号',
    location3   string COMMENT '城市代号',
    country     string COMMENT '国家(中文)',
    cn          string COMMENT '省份(中文)',
    cn_city     string COMMENT '城市(中文)',
    cn_district string COMMENT '城市所属地区(中文)',
    py          string COMMENT '省份缩略(拼音)',
    en_city     string COMMENT '城市(英文)',
    en_district string COMMENT '城市所属地区(英文)',
    en          string COMMENT '省份(英文)',
    geoid       int    COMMENT '行政区划代码',
    updatetime  string COMMENT '更新时间'
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,
cf1:ID,
cf1:location1,
cf1:location2,
cf1:location3,
cf1:country,
cf1:cn,
cf1:cn_city,
cf1:cn_district,
cf1:py,
cf1:en_city,
cf1:en_district,
cf1:en,
cf1:geoid,
cf1:updatetime")
TBLPROPERTIES("hbase.table.name" = "location"); "[/mw_shl_code]
2、自动化的shell脚本
[mw_shl_code=bash,true]#!/bin/sh
# ETL: MySQL -> stage.location_tmp (sqoop) -> ods.hbase_location (HBase).
# The rowkey is reverse(ID) and an update timestamp is appended to each row.
# (Removed an unused `today=` variable and its stale "upload logs" comment
# left over from another script.)

# 1. Recreate the staging table that receives the sqoop import.
#    Its column list must match the --columns list of the sqoop command below.
hive -e"
DROP TABLE IF EXISTS stage.location_tmp;
CREATE TABLE stage.location_tmp (
    ID          INT    COMMENT '唯一ID',
    location1   string COMMENT '国家代号',
    location2   string COMMENT '省份代号',
    location3   string COMMENT '城市代号',
    country     string COMMENT '国家(中文)',
    cn          string COMMENT '省份(中文)',
    cn_city     string COMMENT '城市(中文)',
    cn_district string COMMENT '城市所属地区(中文)',
    py          string COMMENT '省份缩略(拼音)',
    en_city     string COMMENT '城市(英文)',
    en_district string COMMENT '城市所属地区(英文)',
    en          string COMMENT '省份(英文)',
    geoid       INT    COMMENT '行政区划代码'
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001'
STORED AS TEXTFILE;
"

# 2. Pull the source rows from MySQL into the staging table.
# NOTE(review): credentials are embedded in plain text on the command line
# (visible in `ps` and shell history) — prefer sqoop's --password-file or
# --options-file in production.
sqoop import --connect jdbc:mysql://10.130.2.6:3306/bdc_test --username lvwenjuan --password Abcd1234 --table location --where "1=1" --columns "ID ,location1 ,location2 ,location3 ,country ,cn ,cn_city , cn_district ,py , en_city , en_district ,en , geoid " --fields-terminated-by '\001' --hive-import --hive-drop-import-delims --hive-table stage.location_tmp -m 1

# 3. Load into the HBase-backed table: reverse(ID) becomes the rowkey,
#    udf_getbfhourstime(0) supplies the update timestamp column.
hive -e"
INSERT INTO TABLE ods.hbase_location
SELECT reverse(ID),
       ID,
       location1,
       location2,
       location3,
       country,
       cn,
       cn_city,
       cn_district,
       py,
       en_city,
       en_district,
       en,  -- fixed: original had a stray 'string' token here (copy-paste from the DDL)
       geoid,
       udf_getbfhourstime(0)
FROM stage.location_tmp;"

# 4. Drop the staging table; it is only a transfer buffer for this run.
hive -e "DROP TABLE IF EXISTS stage.location_tmp;"[/mw_shl_code]
3、说明
1) stage.location_tmp为临时中转表,本次ETL完后即删除。
2)--where "1=1" 可设置关系库的查询语句
3)reverse(ID) 对hbase的rowkey自动逆序处理
4)insert into 写入 HBase 时,相同 rowkey 的记录会自动覆盖旧值,相当于按 rowkey 去重
5)udf_getbfhourstime(0) 自定义函数,取的是当前时间