本帖最后由 hanyunsong 于 2020-9-18 10:57 编辑
问题导读:
1. 拉链表是什么?
2. 为什么要做拉链表?
3. 拉链表的制作过程是怎么样的?
上一篇:企业级数据仓库构建(七):搭建DWD 层
1.1 DWD 层(业务数据)
1.1.1 商品维度表(全量表)
1)建表语句
- DROP TABLE IF EXISTS `dwd_dim_sku_info`;
- CREATE EXTERNAL TABLE `dwd_dim_sku_info` (
- `id` string COMMENT '商品 id',
- `spu_id` string COMMENT 'spuid',
- `price` double COMMENT '商品价格',
- `sku_name` string COMMENT '商品名称',
- `sku_desc` string COMMENT '商品描述',
- `weight` double COMMENT '重量',
- `tm_id` string COMMENT '品牌 id',
- `tm_name` string COMMENT '品牌名称',
- `category3_id` string COMMENT '三级分类 id',
- `category2_id` string COMMENT '二级分类 id',
- `category1_id` string COMMENT '一级分类 id',
- `category3_name` string COMMENT '三级分类名称',
- `category2_name` string COMMENT '二级分类名称',
- `category1_name` string COMMENT '一级分类名称',
- `spu_name` string COMMENT 'spu 名称',
- `create_time` string COMMENT '创建时间'
- )
- COMMENT '商品维度表'
- PARTITIONED BY (`dt` string)
- stored as parquet
- location '/warehouse/gmall/dwd/dwd_dim_sku_info/'
- tblproperties ("parquet.compression"="lzo");
复制代码
2)数据装载
- insert overwrite table dwd_dim_sku_info partition(dt='2020-03-10')
- select
- sku.id,
- sku.spu_id,
- sku.price,
- sku.sku_name,
- sku.sku_desc,
- sku.weight,
- sku.tm_id,
- ob.tm_name,
- sku.category3_id,
- c2.id category2_id,
- c1.id category1_id,
- c3.name category3_name,
- c2.name category2_name,
- c1.name category1_name,
- spu.spu_name,
- sku.create_time
- from
- (
- select * from ods_sku_info where dt='2020-03-10'
- )sku
- join
- (
- select * from ods_base_trademark where dt='2020-03-10'
- )ob on sku.tm_id=ob.tm_id
- join
- (
- select * from ods_spu_info where dt='2020-03-10'
- )spu on spu.id = sku.spu_id
- join
- (
- select * from ods_base_category3 where dt='2020-03-10'
- )c3 on sku.category3_id=c3.id
- join
- (
- select * from ods_base_category2 where dt='2020-03-10'
- )c2 on c3.category2_id=c2.id
- join
- (
- select * from ods_base_category1 where dt='2020-03-10'
- )c1 on c2.category1_id=c1.id;
复制代码
3)查询加载结果
- select * from dwd_dim_sku_info where dt='2020-03-10';
复制代码
1.1.2 优惠券信息表(全量)
把 ODS 层 ods_coupon_info 表数据导入到 DWD 层优惠卷信息表,在导入过程中可以做适当的清洗
1)建表语句
- drop table if exists dwd_dim_coupon_info;
- create external table dwd_dim_coupon_info(
- `id` string COMMENT '购物券编号',
- `coupon_name` string COMMENT '购物券名称',
- `coupon_type` string COMMENT '购物券类型 1 现金券 2 折扣券 3 满减券 4 满件打折券',
- `condition_amount` string COMMENT '满额数',
- `condition_num` string COMMENT '满件数',
- `activity_id` string COMMENT '活动编号',
- `benefit_amount` string COMMENT '减金额',
- `benefit_discount` string COMMENT '折扣',
- `create_time` string COMMENT '创建时间',
- `range_type` string COMMENT '范围类型 1、商品 2、品类 3、品牌',
- `spu_id` string COMMENT '商品 id',
- `tm_id` string COMMENT '品牌 id',
- `category3_id` string COMMENT '品类 id',
- `limit_num` string COMMENT '最多领用次数',
- `operate_time` string COMMENT '修改时间',
- `expire_time` string COMMENT '过期时间'
- ) COMMENT '优惠券信息表'
- PARTITIONED BY (`dt` string)
- row format delimited fields terminated by '\t'
- stored as parquet
- location '/warehouse/gmall/dwd/dwd_dim_coupon_info/'
- tblproperties ("parquet.compression"="lzo");
复制代码
2)数据装载
- insert overwrite table dwd_dim_coupon_info partition(dt='2020-03-10')
- select
- id,
- coupon_name,
- coupon_type,
- condition_amount,
- condition_num,
- activity_id,
- benefit_amount,
- benefit_discount,
- create_time,
- range_type,
- spu_id,
- tm_id,
- category3_id,
- limit_num,
- operate_time,
- expire_time
- from ods_coupon_info
- where dt='2020-03-10';
复制代码
3)查询加载结果
- select * from dwd_dim_coupon_info where dt='2020-03-10';
复制代码
1.1.3 活动维度表(全量)
1)建表语句
- drop table if exists dwd_dim_activity_info;
- create external table dwd_dim_activity_info(
- `id` string COMMENT '编号',
- `activity_name` string COMMENT '活动名称',
- `activity_type` string COMMENT '活动类型',
- `condition_amount` string COMMENT '满减金额',
- `condition_num` string COMMENT '满减件数',
- `benefit_amount` string COMMENT '优惠金额',
- `benefit_discount` string COMMENT '优惠折扣',
- `benefit_level` string COMMENT '优惠级别',
- `start_time` string COMMENT '开始时间',
- `end_time` string COMMENT '结束时间',
- `create_time` string COMMENT '创建时间'
- ) COMMENT '活动信息表'
- PARTITIONED BY (`dt` string)
- row format delimited fields terminated by '\t'
- stored as parquet
- location '/warehouse/gmall/dwd/dwd_dim_activity_info/'
- tblproperties ("parquet.compression"="lzo");
复制代码
2)数据装载
- insert overwrite table dwd_dim_activity_info partition(dt='2020-03-10')
- select
- info.id,
- info.activity_name,
- info.activity_type,
- rule.condition_amount,
- rule.condition_num,
- rule.benefit_amount,
- rule.benefit_discount,
- rule.benefit_level,
- info.start_time,
- info.end_time,
- info.create_time
- from
- (
- select * from ods_activity_info where dt='2020-03-10'
- )info
- left join
- (
- select * from ods_activity_rule where dt='2020-03-10'
- )rule on info.id = rule.activity_id;
复制代码
3)查询加载结果
- select * from dwd_dim_activity_info where dt='2020-03-10';
复制代码
1.1.4 地区维度表(特殊)
1)建表语句
- DROP TABLE IF EXISTS `dwd_dim_base_province`;
- CREATE EXTERNAL TABLE `dwd_dim_base_province` (
- `id` string COMMENT 'id',
- `province_name` string COMMENT '省市名称',
- `area_code` string COMMENT '地区编码',
- `iso_code` string COMMENT 'ISO 编码',
- `region_id` string COMMENT '地区 id',
- `region_name` string COMMENT '地区名称'
- )
- COMMENT '地区省市表'
- stored as parquet
- location '/warehouse/gmall/dwd/dwd_dim_base_province/'
- tblproperties ("parquet.compression"="lzo");
复制代码
2)数据装载
- insert overwrite table dwd_dim_base_province
- select
- bp.id,
- bp.name,
- bp.area_code,
- bp.iso_code,
- bp.region_id,
- br.region_name
- from ods_base_province bp
- join ods_base_region br
- on bp.region_id=br.id;
复制代码
1.1.5 时间维度表(特殊)(预留)
1)建表语句
- DROP TABLE IF EXISTS `dwd_dim_date_info`;
- CREATE EXTERNAL TABLE `dwd_dim_date_info`(
- `date_id` string COMMENT '日',
- `week_id` int COMMENT '周',
- `week_day` int COMMENT '周的第几天',
- `day` int COMMENT '每月的第几天',
- `month` int COMMENT '第几月',
- `quarter` int COMMENT '第几季度',
- `year` int COMMENT '年',
- `is_workday` int COMMENT '是否是周末',
- `holiday_id` int COMMENT '是否是节假日'
- )
- row format delimited fields terminated by '\t'
- stored as parquet
- location '/warehouse/gmall/dwd/dwd_dim_date_info/'
- tblproperties ("parquet.compression"="lzo");
复制代码
2)把 date_info.txt 文件上传到 node01 的 /opt/modules/db_log/路径
3)数据装载
- load data local inpath '/opt/modules/db_log/date_info.txt' into table dwd_dim_date_info;
复制代码
4)查询加载结果
- select * from dwd_dim_date_info;
复制代码
1.1.6 订单明细事实表(事务型快照事实表)
1)建表语句
- drop table if exists dwd_fact_order_detail;
- create external table dwd_fact_order_detail (
- `id` string COMMENT '订单编号',
- `order_id` string COMMENT '订单号',
- `user_id` string COMMENT '用户 id',
- `sku_id` string COMMENT 'sku 商品 id',
- `sku_name` string COMMENT '商品名称',
- `order_price` decimal(10,2) COMMENT '商品价格',
- `sku_num` bigint COMMENT '商品数量',
- `create_time` string COMMENT '创建时间',
- `province_id` string COMMENT '省份 ID',
- `total_amount` decimal(20,2) COMMENT '订单总金额'
- )
- PARTITIONED BY (`dt` string)
- stored as parquet
- location '/warehouse/gmall/dwd/dwd_fact_order_detail/'
- tblproperties ("parquet.compression"="lzo");
复制代码
2)数据装载
- insert overwrite table dwd_fact_order_detail partition(dt='2020-03-10')
- select
- od.id,
- od.order_id,
- od.user_id,
- od.sku_id,
- od.sku_name,
- od.order_price,
- od.sku_num,
- od.create_time,
- oi.province_id,
- od.order_price*od.sku_num
- from
- (
- select * from ods_order_detail where dt='2020-03-10'
- ) od
- join
- (
- select * from ods_order_info where dt='2020-03-10'
- ) oi
- on od.order_id=oi.id;
复制代码
3)查询加载结果
- select * from dwd_fact_order_detail where dt='2020-03-10';
复制代码
1.1.7 支付事实表(事务型快照事实表)
1)建表语句
- drop table if exists dwd_fact_payment_info;
- create external table dwd_fact_payment_info (
- `id` string COMMENT '',
- `out_trade_no` string COMMENT '对外业务编号',
- `order_id` string COMMENT '订单编号',
- `user_id` string COMMENT '用户编号',
- `alipay_trade_no` string COMMENT '支付宝交易流水编号',
- `payment_amount` decimal(16,2) COMMENT '支付金额',
- `subject` string COMMENT '交易内容',
- `payment_type` string COMMENT '支付类型',
- `payment_time` string COMMENT '支付时间',
- `province_id` string COMMENT '省份 ID'
- )
- PARTITIONED BY (`dt` string)
- stored as parquet
- location '/warehouse/gmall/dwd/dwd_fact_payment_info/'
- tblproperties ("parquet.compression"="lzo");
复制代码
2)数据装载
- insert overwrite table dwd_fact_payment_info partition(dt='2020-03-10')
- select
- pi.id,
- pi.out_trade_no,
- pi.order_id,
- pi.user_id,
- pi.alipay_trade_no,
- pi.total_amount,
- pi.subject,
- pi.payment_type,
- pi.payment_time,
- oi.province_id
- from
- (
- select * from ods_payment_info where dt='2020-03-10'
- )pi
- join
- (
- select id, province_id from ods_order_info where dt='2020-03-10'
- )oi
- on pi.order_id = oi.id;
复制代码
3)查询加载结果
- select * from dwd_fact_payment_info where dt='2020-03-10';
复制代码
1.1.8 退款事实表(事务型快照事实表)
把 ODS 层 ods_order_refund_info 表数据导入到 DWD 层退款事实表,在导入过程中可以做适当的清洗
1)建表语句
- drop table if exists dwd_fact_order_refund_info;
- create external table dwd_fact_order_refund_info(
- `id` string COMMENT '编号',
- `user_id` string COMMENT '用户 ID',
- `order_id` string COMMENT '订单 ID',
- `sku_id` string COMMENT '商品 ID',
- `refund_type` string COMMENT '退款类型',
- `refund_num` bigint COMMENT '退款件数',
- `refund_amount` decimal(16,2) COMMENT '退款金额',
- `refund_reason_type` string COMMENT '退款原因类型',
- `create_time` string COMMENT '退款时间'
- ) COMMENT '退款事实表'
- PARTITIONED BY (`dt` string)
- row format delimited fields terminated by '\t'
- location '/warehouse/gmall/dwd/dwd_fact_order_refund_info/';
复制代码
2)数据装载
- insert overwrite table dwd_fact_order_refund_info partition(dt='2020-03-10')
- select
- id,
- user_id,
- order_id,
- sku_id,
- refund_type,
- refund_num,
- refund_amount,
- refund_reason_type,
- create_time
- from ods_order_refund_info
- where dt='2020-03-10';
复制代码
3)查询加载结果
- select * from dwd_fact_order_refund_info where dt='2020-03-10';
复制代码
1.1.9 评价事实表(事务型快照事实表)
把 ODS 层 ods_comment_info 表数据导入到 DWD 层评价事实表,在导入过程中可以做适当的清洗
1)建表语句
- drop table if exists dwd_fact_comment_info;
- create external table dwd_fact_comment_info(
- `id` string COMMENT '编号',
- `user_id` string COMMENT '用户 ID',
- `sku_id` string COMMENT '商品 sku',
- `spu_id` string COMMENT '商品 spu',
- `order_id` string COMMENT '订单 ID',
- `appraise` string COMMENT '评价',
- `create_time` string COMMENT '评价时间'
- ) COMMENT '评价事实表'
- PARTITIONED BY (`dt` string)
- row format delimited fields terminated by '\t'
- location '/warehouse/gmall/dwd/dwd_fact_comment_info/';
复制代码
2)数据装载
- insert overwrite table dwd_fact_comment_info partition(dt='2020-03-10')
- select
- id,
- user_id,
- sku_id,
- spu_id,
- order_id,
- appraise,
- create_time
- from ods_comment_info
- where dt='2020-03-10';
复制代码
3)查询加载结果
- select * from dwd_fact_comment_info where dt='2020-03-10';
复制代码
1.1.10 加购事实表(周期型快照事实表,每日快照)
由于购物车的数量是会发生变化,所以导增量不合适
每天做一次快照,导入的数据是全量,区别于事务型事实表是每天导入新增
周期型快照事实表劣势:存储的数据量会比较大
解决方案:周期型快照事实表存储的数据比较讲究时效性,时间太久了的意义不大,可以删除以前的数据
1)建表语句
- drop table if exists dwd_fact_cart_info;
- create external table dwd_fact_cart_info(
- `id` string COMMENT '编号',
- `user_id` string COMMENT '用户 id',
- `sku_id` string COMMENT 'skuid',
- `cart_price` string COMMENT '放入购物车时价格',
- `sku_num` string COMMENT '数量',
- `sku_name` string COMMENT 'sku 名称 (冗余)',
- `create_time` string COMMENT '创建时间',
- `operate_time` string COMMENT '修改时间',
- `is_ordered` string COMMENT '是否已经下单。1 为已下单;0 为未下单',
- `order_time` string COMMENT '下单时间'
- ) COMMENT '加购事实表'
- PARTITIONED BY (`dt` string)
- row format delimited fields terminated by '\t'
- location '/warehouse/gmall/dwd/dwd_fact_cart_info/';
复制代码
2)数据装载
- insert overwrite table dwd_fact_cart_info partition(dt='2020-03-10')
- select
- id,
- user_id,
- sku_id,
- cart_price,
- sku_num,
- sku_name,
- create_time,
- operate_time,
- is_ordered,
- order_time
- from ods_cart_info
- where dt='2020-03-10';
复制代码
3)查询加载结果
- select * from dwd_fact_cart_info where dt='2020-03-10';
复制代码
1.1.11 收藏事实表(周期型快照事实表,每日快照)
收藏的标记,是否取消,会发生变化,做增量不合适
每天做一次快照,导入的数据是全量,区别于事务型事实表是每天导入新增
1)建表语句
- drop table if exists dwd_fact_favor_info;
- create external table dwd_fact_favor_info(
- `id` string COMMENT '编号',
- `user_id` string COMMENT '用户 id',
- `sku_id` string COMMENT 'skuid',
- `spu_id` string COMMENT 'spuid',
- `is_cancel` string COMMENT '是否取消',
- `create_time` string COMMENT '收藏时间',
- `cancel_time` string COMMENT '取消时间'
- ) COMMENT '收藏事实表'
- PARTITIONED BY (`dt` string)
- row format delimited fields terminated by '\t'
- location '/warehouse/gmall/dwd/dwd_fact_favor_info/';
复制代码
2)数据装载
- insert overwrite table dwd_fact_favor_info partition(dt='2020-03-10')
- select
- id,
- user_id,
- sku_id,
- spu_id,
- is_cancel,
- create_time,
- cancel_time
- from ods_favor_info
- where dt='2020-03-10';
复制代码
3)查询加载结果
- select * from dwd_fact_favor_info where dt='2020-03-10';
复制代码
1.1.12 优惠券领用事实表(累积型快照事实表)
优惠卷的生命周期:领取优惠卷-》用优惠卷下单-》优惠卷参与支付
累积型快照事实表使用:统计优惠卷领取次数、优惠卷下单次数、优惠卷参与支付次数
1)建表语句
- drop table if exists dwd_fact_coupon_use;
- create external table dwd_fact_coupon_use(
- `id` string COMMENT '编号',
- `coupon_id` string COMMENT '优惠券 ID',
- `user_id` string COMMENT 'userid',
- `order_id` string COMMENT '订单 id',
- `coupon_status` string COMMENT '优惠券状态',
- `get_time` string COMMENT '领取时间',
- `using_time` string COMMENT '使用时间(下单)',
- `used_time` string COMMENT '使用时间(支付)'
- ) COMMENT '优惠券领用事实表'
- PARTITIONED BY (`dt` string)
- row format delimited fields terminated by '\t'
- location '/warehouse/gmall/dwd/dwd_fact_coupon_use/';
复制代码
注意:dt 是按照优惠卷领用时间 get_time 做为分区
2)数据装载
- set hive.exec.dynamic.partition.mode=nonstrict;
- insert overwrite table dwd_fact_coupon_use partition(dt)
- select
- if(new.id is null,old.id,new.id),
- if(new.coupon_id is null,old.coupon_id,new.coupon_id),
- if(new.user_id is null,old.user_id,new.user_id),
- if(new.order_id is null,old.order_id,new.order_id),
- if(new.coupon_status is null,old.coupon_status,new.coupon_status),
- if(new.get_time is null,old.get_time,new.get_time),
- if(new.using_time is null,old.using_time,new.using_time),
- if(new.used_time is null,old.used_time,new.used_time),
- date_format(if(new.get_time is null,old.get_time,new.get_time),'yyyy-MM-dd')
- from
- (
- select
- id,
- coupon_id,
- user_id,
- order_id,
- coupon_status,
- get_time,
- using_time,
- used_time
- from dwd_fact_coupon_use
- where dt in
- (
- select
- date_format(get_time,'yyyy-MM-dd')
- from ods_coupon_use
- where dt='2020-03-10'
- )
- )old
- full outer join
- (
- select
- id,
- coupon_id,
- user_id,
- order_id,
- coupon_status,
- get_time,
- using_time,
- used_time
- from ods_coupon_use
- where dt='2020-03-10'
- )new
- on old.id=new.id;
复制代码
3)查询加载结果
- select * from dwd_fact_coupon_use where dt='2020-03-10';
复制代码
1.1.13 订单事实表(累积型快照事实表)
1)concat 函数
concat 函数在连接字符串的时候,只要其中一个是 NULL,那么将返回 NULL
- hive> select concat('a','b');
- ab
- hive> select concat('a','b',null);
- NULL
复制代码
2)concat_ws 函数
concat_ws 函数在连接字符串的时候,只要有一个字符串不是 NULL,就不会返回 NULL。concat_ws 函数需要指定分隔符
- hive> select concat_ws('-','a','b');
- a-b
- hive> select concat_ws('-','a','b',null);
- a-b
- hive> select concat_ws('','a','b',null);
- ab
复制代码
3)STR_TO_MAP 函数
STR_TO_MAP(VARCHAR text, VARCHAR listDelimiter, VARCHAR keyValueDelimiter)
使用 listDelimiter 将 text 分隔成 K-V 对,然后使用 keyValueDelimiter 分隔每个 K-V 对,
组装成 MAP 返回。默认 listDelimiter 为( ,),keyValueDelimiter 为(=)。
str_to_map(‘1001=2020-03-10,1002=2020-03-10’, ‘,’ , ‘=’)
输出{“1001”:“2020-03-10”,“1002”:“2020-03-10”}
4)建表语句
订单生命周期:创建时间=》支付时间=》取消时间=》完成时间=》退款时间=》退款完成时间
由于 ODS 层订单表只有创建时间和操作时间两个状态,不能表达所有时间含义,所以需要关联订单状态表。订单事实表里面增加了活动 id,所以需要关联活动订单表
- drop table if exists dwd_fact_order_info;
- create external table dwd_fact_order_info (
- `id` string COMMENT '订单编号',
- `order_status` string COMMENT '订单状态',
- `user_id` string COMMENT '用户 id',
- `out_trade_no` string COMMENT '支付流水号',
- `create_time` string COMMENT '创建时间(未支付状态)',
- `payment_time` string COMMENT '支付时间(已支付状态)',
- `cancel_time` string COMMENT '取消时间(已取消状态)',
- `finish_time` string COMMENT '完成时间(已完成状态)',
- `refund_time` string COMMENT '退款时间(退款中状态)',
- `refund_finish_time` string COMMENT '退款完成时间(退款完成状态)',
- `province_id` string COMMENT '省份 ID',
- `activity_id` string COMMENT '活动 ID',
- `original_total_amount` string COMMENT '原价金额',
- `benefit_reduce_amount` string COMMENT '优惠金额',
- `feight_fee` string COMMENT '运费',
- `final_total_amount` decimal(10,2) COMMENT '订单金额'
- )
- PARTITIONED BY (`dt` string)
- stored as parquet
- location '/warehouse/gmall/dwd/dwd_fact_order_info/'
- tblproperties ("parquet.compression"="lzo");
复制代码
5)数据装载
5)常用函数
更多函数请点击博客【HIve】Hive入门解析(五)
6)数据装载
- set hive.exec.dynamic.partition.mode=nonstrict;
- insert overwrite table dwd_fact_order_info partition(dt)
- select
- if(new.id is null,old.id,new.id),
- if(new.order_status is null,old.order_status,new.order_status),
- if(new.user_id is null,old.user_id,new.user_id),
- if(new.out_trade_no is null,old.out_trade_no,new.out_trade_no),
- if(new.tms['1001'] is null,old.create_time,new.tms['1001']),--1001 对应未支付状态
- if(new.tms['1002'] is null,old.payment_time,new.tms['1002']),
- if(new.tms['1003'] is null,old.cancel_time,new.tms['1003']),
- if(new.tms['1004'] is null,old.finish_time,new.tms['1004']),
- if(new.tms['1005'] is null,old.refund_time,new.tms['1005']),
- if(new.tms['1006'] is null,old.refund_finish_time,new.tms['1006']),
- if(new.province_id is null,old.province_id,new.province_id),
- if(new.activity_id is null,old.activity_id,new.activity_id),
- if(new.original_total_amount is
- null,old.original_total_amount,new.original_total_amount),
- if(new.benefit_reduce_amount is
- null,old.benefit_reduce_amount,new.benefit_reduce_amount),
- if(new.feight_fee is null,old.feight_fee,new.feight_fee),
- if(new.final_total_amount is null,old.final_total_amount,new.final_total_amount),
- date_format(if(new.tms['1001'] is
- null,old.create_time,new.tms['1001']),'yyyy-MM-dd')
- from
- (
- select
- id,
- order_status,
- user_id,
- out_trade_no,
- create_time,
- payment_time,
- cancel_time,
- finish_time,
- refund_time,
- refund_finish_time,
- province_id,
- activity_id,
- original_total_amount,
- benefit_reduce_amount,
- feight_fee,
- final_total_amount
- from dwd_fact_order_info
- where dt
- in
- (
- select
- date_format(create_time,'yyyy-MM-dd')
- from ods_order_info
- where dt='2020-03-10'
- )
- )old
- full outer join
- (
- select
- info.id,
- info.order_status,
- info.user_id,
- info.out_trade_no,
- info.province_id,
- act.activity_id,
- log.tms,
- info.original_total_amount,
- info.benefit_reduce_amount,
- info.feight_fee,
- info.final_total_amount
- from
- (
- select
- order_id,
- str_to_map(concat_ws(',',collect_set(concat(order_status,'=',operate_time))),',','=')
- tms
- from ods_order_status_log
- where dt='2020-03-10'
- group by order_id
- )log
- join
- (
- select * from ods_order_info where dt='2020-03-10'
- )info
- on log.order_id=info.id
- left join
- (
- select * from ods_activity_order where dt='2020-03-10'
- )act
- on log.order_id=act.order_id
- )new
- on old.id=new.id;
复制代码
6)查询加载结果
- select * from dwd_fact_order_info where dt='2020-03-10';
复制代码
1.1.14 用户维度表(拉链表)
用户表中的数据每日既有可能新增,也有可能修改,但修改频率并不高,属于缓慢变化
维度,此处采用拉链表存储用户维度数据
1)什么是拉链表
2)为什么要做拉链表
3)拉链表形成过程
4)拉链表制作过程图
5)拉链表制作过程
步骤 0:初始化拉链表(首次独立执行)
(1)建立拉链表
- drop table if exists dwd_dim_user_info_his;
- create external table dwd_dim_user_info_his(
- `id` string COMMENT '用户 id',
- `name` string COMMENT '姓名',
- `birthday` string COMMENT '生日',
- `gender` string COMMENT '性别',
- `email` string COMMENT '邮箱',
- `user_level` string COMMENT '用户等级',
- `create_time` string COMMENT '创建时间',
- `operate_time` string COMMENT '操作时间',
- `start_date` string COMMENT '有效开始日期',
- `end_date` string COMMENT '有效结束日期'
- ) COMMENT '订单拉链表'
- stored as parquet
- location '/warehouse/gmall/dwd/dwd_dim_user_info_his/'
- tblproperties ("parquet.compression"="lzo");
复制代码
(2)初始化拉链表
- insert overwrite table dwd_dim_user_info_his
- select
- id,
- name,
- birthday,
- gender,
- email,
- user_level,
- create_time,
- operate_time,
- '2020-03-10',
- '9999-99-99'
- from ods_user_info oi
- where oi.dt='2020-03-10';
复制代码
步骤 1:制作当日变动数据(包括新增,修改)每日执行
(1)如何获得每日变动表
- a.最好表内有创建时间和变动时间(Lucky!)
- b.如果没有,可以利用第三方工具监控比如 canal,监控 MySQL 的实时变化进行记录(麻烦)
- c.逐行对比前后两天的数据,检查 md5(concat(全部有可能变化的字段))是否相同(low)
- d.要求业务数据库提供变动流水(人品,颜值)
(2)因为 ods_order_info 本身导入过来就是新增变动明细的表,所以不用处理
- a)数据库中新增 2020-03-11 一天的数据
- b)通过 Sqoop 把 2020-03-11 日所有数据导入mysqlTohdfs.sh all 2020-03-11
- c)ods 层数据导入hdfs_to_ods_db.sh all 2020-03-11
步骤 2:先合并变动信息,再追加新增信息,插入到临时表中
1)建立临时表
- drop table if exists dwd_dim_user_info_his_tmp;
- create external table dwd_dim_user_info_his_tmp(
- `id` string COMMENT '用户 id',
- `name` string COMMENT '姓名',
- `birthday` string COMMENT '生日',
- `gender` string COMMENT '性别',
- `email` string COMMENT '邮箱',
- `user_level` string COMMENT '用户等级',
- `create_time` string COMMENT '创建时间',
- `operate_time` string COMMENT '操作时间',
- `start_date` string COMMENT '有效开始日期',
- `end_date` string COMMENT '有效结束日期'
- ) COMMENT '订单拉链临时表'
- stored as parquet
- location '/warehouse/gmall/dwd/dwd_dim_user_info_his_tmp/'
- tblproperties ("parquet.compression"="lzo");
复制代码
2)导入脚本
- insert overwrite table dwd_dim_user_info_his_tmp
- select * from
- (
- select
- id,
- name,
- birthday,
- gender,
- email,
- user_level,
- create_time,
- operate_time,
- '2020-03-11' start_date,
- '9999-99-99' end_date
- from ods_user_info where dt='2020-03-11'
- union all
- select
- uh.id,
- uh.name,
- uh.birthday,
- uh.gender,
- uh.email,
- uh.user_level,
- uh.create_time,
- uh.operate_time,
- uh.start_date,
- if(ui.id is not null and uh.end_date='9999-99-99', date_add(ui.dt,-1),
- uh.end_date) end_date
- from dwd_dim_user_info_his uh left join
- (
- select
- *
- from ods_user_info
- where dt='2020-03-11'
- ) ui on uh.id=ui.id
- )his
- order by his.id, start_date;
复制代码
步骤 3:把临时表覆盖给拉链表
1)导入数据
- insert overwrite table dwd_dim_user_info_his
- select * from dwd_dim_user_info_his_tmp;
复制代码
2)查询导入数据
- select id, start_date, end_date from dwd_dim_user_info_his;
复制代码
1.1.15 DWD 层数据导入脚本
1)vim ods_to_dwd_db.sh
- #!/bin/bash
- APP=gmall
- hive=/opt/modules/hive/bin/hive
- # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
- if [ -n "$2" ] ;then
- do_date=$2
- else
- do_date=`date -d "-1 day" +%F`
- fi
- sql1="
- set hive.exec.dynamic.partition.mode=nonstrict;
-
- insert overwrite table ${APP}.dwd_dim_sku_info partition(dt='$do_date')
- select
- sku.id,
- sku.spu_id,
- sku.price,
- sku.sku_name,
- sku.sku_desc,
- sku.weight,
- sku.tm_id,
- ob.tm_name,
- sku.category3_id,
- c2.id category2_id,
- c1.id category1_id,
- c3.name category3_name,
- c2.name category2_name,
- c1.name category1_name,
- spu.spu_name,
- sku.create_time
- from
- (
- select * from ${APP}.ods_sku_info where dt='$do_date'
- )sku
- join
- (
- select * from ${APP}.ods_base_trademark where dt='$do_date'
- )ob on sku.tm_id=ob.tm_id
- join
- (
- select * from ${APP}.ods_spu_info where dt='$do_date'
- )spu on spu.id = sku.spu_id
- join
- (
- select * from ${APP}.ods_base_category3 where dt='$do_date'
- )c3 on sku.category3_id=c3.id
- join
- (
- select * from ${APP}.ods_base_category2 where dt='$do_date'
- )c2 on c3.category2_id=c2.id
- join
- (
- select * from ${APP}.ods_base_category1 where dt='$do_date'
- )c1 on c2.category1_id=c1.id;
-
-
- insert overwrite table ${APP}.dwd_dim_coupon_info partition(dt='$do_date')
- select
- id,
- coupon_name,
- coupon_type,
- condition_amount,
- condition_num,
- activity_id,
- benefit_amount,
- benefit_discount,
- create_time,
- range_type,
- spu_id,
- tm_id,
- category3_id,
- limit_num,
- operate_time,
- expire_time
- from ${APP}.ods_coupon_info
- where dt='$do_date';
-
-
- insert overwrite table ${APP}.dwd_dim_activity_info partition(dt='$do_date')
- select
- info.id,
- info.activity_name,
- info.activity_type,
- rule.condition_amount,
- rule.condition_num,
- rule.benefit_amount,
- rule.benefit_discount,
- rule.benefit_level,
- info.start_time,
- info.end_time,
- info.create_time
- from
- (
- select * from ${APP}.ods_activity_info where dt='$do_date'
- )info
- left join
- (
- select * from ${APP}.ods_activity_rule where dt='$do_date'
- )rule on info.id = rule.activity_id;
-
-
- insert overwrite table ${APP}.dwd_fact_order_detail partition(dt='$do_date')
- select
- od.id,
- od.order_id,
- od.user_id,
- od.sku_id,
- od.sku_name,
- od.order_price,
- od.sku_num,
- od.create_time,
- oi.province_id,
- od.order_price*od.sku_num
- from
- (
- select * from ${APP}.ods_order_detail where dt='$do_date'
- ) od
- join
- (
- select * from ${APP}.ods_order_info where dt='$do_date'
- ) oi
- on od.order_id=oi.id;
-
-
- insert overwrite table ${APP}.dwd_fact_payment_info partition(dt='$do_date')
- select
- pi.id,
- pi.out_trade_no,
- pi.order_id,
- pi.user_id,
- pi.alipay_trade_no,
- pi.total_amount,
- pi.subject,
- pi.payment_type,
- pi.payment_time,
- oi.province_id
- from
- (
- select * from ${APP}.ods_payment_info where dt='$do_date'
- )pi
- join
- (
- select id, province_id from ${APP}.ods_order_info where dt='$do_date'
- )oi
- on pi.order_id = oi.id;
-
-
- insert overwrite table ${APP}.dwd_fact_order_refund_info partition(dt='$do_date')
- select
- id,
- user_id,
- order_id,
- sku_id,
- refund_type,
- refund_num,
- refund_amount,
- refund_reason_type,
- create_time
- from ${APP}.ods_order_refund_info
- where dt='$do_date';
-
-
- insert overwrite table ${APP}.dwd_fact_comment_info partition(dt='$do_date')
- select
- id,
- user_id,
- sku_id,
- spu_id,
- order_id,
- appraise,
- create_time
- from ${APP}.ods_comment_info
- where dt='$do_date';
-
-
- insert overwrite table ${APP}.dwd_fact_cart_info partition(dt='$do_date')
- select
- id,
- user_id,
- sku_id,
- cart_price,
- sku_num,
- sku_name,
- create_time,
- operate_time,
- is_ordered,
- order_time
- from ${APP}.ods_cart_info
- where dt='$do_date';
-
-
- insert overwrite table ${APP}.dwd_fact_favor_info partition(dt='$do_date')
- select
- id,
- user_id,
- sku_id,
- spu_id,
- is_cancel,
- create_time,
- cancel_time
- from ${APP}.ods_favor_info
- where dt='$do_date';
-
-
- insert overwrite table ${APP}.dwd_fact_coupon_use partition(dt)
- select
- if(new.id is null,old.id,new.id),
- if(new.coupon_id is null,old.coupon_id,new.coupon_id),
- if(new.user_id is null,old.user_id,new.user_id),
- if(new.order_id is null,old.order_id,new.order_id),
- if(new.coupon_status is null,old.coupon_status,new.coupon_status),
- if(new.get_time is null,old.get_time,new.get_time),
- if(new.using_time is null,old.using_time,new.using_time),
- if(new.used_time is null,old.used_time,new.used_time),
- date_format(if(new.get_time is null,old.get_time,new.get_time),'yyyy-MM-dd')
- from
- (
- select
- id,
- coupon_id,
- user_id,
- order_id,
- coupon_status,
- get_time,
- using_time,
- used_time
- from ${APP}.dwd_fact_coupon_use
- where dt in
- (
- select
- date_format(get_time,'yyyy-MM-dd')
- from ${APP}.ods_coupon_use
- where dt='$do_date'
- )
- )old
- full outer join
- (
- select
- id,
- coupon_id,
- user_id,
- order_id,
- coupon_status,
- get_time,
- using_time,
- used_time
- from ${APP}.ods_coupon_use
- where dt='$do_date'
- )new
- on old.id=new.id;
-
-
- insert overwrite table ${APP}.dwd_fact_order_info partition(dt)
- select
- if(new.id is null,old.id,new.id),
- if(new.order_status is null,old.order_status,new.order_status),
- if(new.user_id is null,old.user_id,new.user_id),
- if(new.out_trade_no is null,old.out_trade_no,new.out_trade_no),
- if(new.tms['1001'] is null,old.create_time,new.tms['1001']),--1001 对应未支付状态
- if(new.tms['1002'] is null,old.payment_time,new.tms['1002']),
- if(new.tms['1003'] is null,old.cancel_time,new.tms['1003']),
- if(new.tms['1004'] is null,old.finish_time,new.tms['1004']),
- if(new.tms['1005'] is null,old.refund_time,new.tms['1005']),
- if(new.tms['1006'] is null,old.refund_finish_time,new.tms['1006']),
- if(new.province_id is null,old.province_id,new.province_id),
- if(new.activity_id is null,old.activity_id,new.activity_id),
- if(new.original_total_amount is
- null,old.original_total_amount,new.original_total_amount),
- if(new.benefit_reduce_amount is
- null,old.benefit_reduce_amount,new.benefit_reduce_amount),
- if(new.feight_fee is null,old.feight_fee,new.feight_fee),
- if(new.final_total_amount is
- null,old.final_total_amount,new.final_total_amount),
- date_format(if(new.tms['1001'] is
- null,old.create_time,new.tms['1001']),'yyyy-MM-dd')
- from
- (
- select
- id,
- order_status,
- user_id,
- out_trade_no,
- create_time,
- payment_time,
- cancel_time,
- finish_time,
- refund_time,
- refund_finish_time,
- province_id,
- activity_id,
- original_total_amount,
- benefit_reduce_amount,
- feight_fee,
- final_total_amount
- from ${APP}.dwd_fact_order_info
- where dt
- in
- (
- select
- date_format(create_time,'yyyy-MM-dd')
- from ${APP}.ods_order_info
- where dt='$do_date'
- )
- )old
- full outer join
- (
- select
- info.id,
- info.order_status,
- info.user_id,
- info.out_trade_no,
- info.province_id,
- act.activity_id,
- log.tms,
- info.original_total_amount,
- info.benefit_reduce_amount,
- info.feight_fee,
- info.final_total_amount
- from
- (
- select
- order_id,
- str_to_map(concat_ws(',',collect_set(concat(order_status,'=',operate_time))),',','
- =') tms
- from ${APP}.ods_order_status_log
- where dt='$do_date'
- group by order_id
- )log
- join
- (
- select * from ${APP}.ods_order_info where dt='$do_date'
- )info
- on log.order_id=info.id
- left join
- (
- select * from ${APP}.ods_activity_order where dt='$do_date'
- )act
- on log.order_id=act.order_id
- )new
- on old.id=new.id;
-
-
- insert overwrite table ${APP}.dwd_dim_user_info_his_tmp
- select * from
- (
- select
- id,
- name,
- birthday,
- gender,
- email,
- user_level,
- create_time,
- operate_time,
- '$do_date' start_date,
- '9999-99-99' end_date
- from ${APP}.ods_user_info where dt='$do_date'
- union all
- select
- uh.id,
- uh.name,
- uh.birthday,
- uh.gender,
- uh.email,
- uh.user_level,
- uh.create_time,
- uh.operate_time,
- uh.start_date,
- if(ui.id is not null and uh.end_date='9999-99-99', date_add(ui.dt,-1),
- uh.end_date) end_date
- from ${APP}.dwd_dim_user_info_his uh left join
- (
- select
- *
- from ${APP}.ods_user_info
- where dt='$do_date'
- ) ui on uh.id=ui.id
- )his
- order by his.id, start_date;
-
-
- insert overwrite table ${APP}.dwd_dim_user_info_his select * from
- ${APP}.dwd_dim_user_info_his_tmp;
- "
-
- sql2="
- insert overwrite table ${APP}.dwd_dim_base_province
- select
- bp.id,
- bp.name,
- bp.area_code,
- bp.iso_code,
- bp.region_id,
- br.region_name
- from ${APP}.ods_base_province bp
- join ${APP}.ods_base_region br
- on bp.region_id=br.id;
- "
-
- case $1 in
- "first"){
- $hive -e "$sql1"
- $hive -e "$sql2"
- };;
- "all"){
- $hive -e "$sql1"
- };;
- esac
复制代码
2)增加脚本执行权限
- chmod 770 ods_to_dwd_db.sh
复制代码
3)执行脚本导入数据
- ods_to_dwd_db.sh all 2020-03-11
复制代码
4)查看导入数据
- select * from dwd_fact_order_info where dt='2020-03-11';
- select * from dwd_fact_order_detail where dt='2020-03-11';
- select * from dwd_fact_comment_info where dt='2020-03-11';
- select * from dwd_fact_order_refund_info where dt='2020-03-11';
复制代码
结束语
本章着重介绍了DWD层的业务数据的搭建流程,自此,DWD层搭建完成,下章开启DWS层的搭建!!!
加微信w3aboutyun,获取更多资源
领取100本书+1T资源
http://www.aboutyun.com/forum.php?mod=viewthread&tid=26480
大数据5个项目视频
http://www.aboutyun.com/forum.php?mod=viewthread&tid=25235
名企资源、名企面试题、最新BAT面试题、专题面试题等资源汇总
https://www.aboutyun.com/forum.php?mod=viewthread&tid=27732
文章来源:https://together.blog.csdn.net/article/details/105924483
|