大数据项目之电商数仓（总结）（十四）:用户行为数据仓库

问题导读：
1、如何统计每日各类别下点击次数top10的商品？
2、如何使用日志数据用户行为宽表作为DWS层表？
3、如何统计点击次数最多的10个用户点击的商品次数top10？
4、如何统计月活跃用户与截止到该月累计的用户总和之间的比例？

上一篇：大数据项目之电商数仓（总结）（十三）:用户行为数据仓库

第18章需求十二：统计每日各类别下点击次数top10的商品
18.1 DWS层
使用点击日志表作为DWS层数据源
18.2 ADS层
18.2.1 建表语句

drop table if exists ads_goods_display_top10;
create external table ads_goods_display_top10 ( 
    `dt` string COMMENT '日期',
    `category` string COMMENT '品类',
    `goodsid` string COMMENT '商品id',
    `goods_count` string COMMENT '商品点击次数'
) 
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_goods_display_top10';
复制代码

18.2.2 导入数据

insert into table ads_goods_display_top10
select
  '2019-10-03',
  category,
  goodsid,
  count
from(
  select
    category,
    goodsid,
    count,
    rank() over(partition by category order by count desc) rk
  from(
    select
      category,
      goodsid,
      count(*) count
    from dwd_display_log
    where dt='2019-10-03' and action=2
    group by category, goodsid
  )t1
)t2
where rk<=10;
复制代码

18.2.3 导入数据脚本
1）在/home/kgg/bin目录下创建脚本ads_goods_display_top10.sh

[kgg@hadoop102 bin]$ vim ads_goods_display_top10.sh
复制代码

在脚本中填写如下内容

#!/bin/bash
db=gmall
hive=/opt/module/hive/bin/hive
hadoop=/opt/module/hadoop/bin/hadoop

if [[ -n $1 ]]; then
    do_date=$1
else
    do_date=`date -d '-1 day' +%F`
fi

sql="
use gmall;
insert into table ads_goods_display_top10
select
  '$do_date',
  category,
  goodsid,
  count
from(
  select
    category,
    goodsid,
    count,
    rank() over(partition by category order by count desc) rk
  from(
    select
      category,
      goodsid,
      count(*) count
    from dwd_display_log
    where dt='$do_date' and action=2
    group by category, goodsid
  )t1
)t2
where rk<=10
"

$hive -e "$sql"
复制代码

2）增加脚本执行权限

[kgg@hadoop102 bin]$ chmod 777 ads_goods_display_top10.sh
复制代码

3）脚本使用

[kgg@hadoop102 module]$ ads_goods_display_top10.sh 2019-02-20
复制代码

4）查询结果

hive (gmall)> select * from ads_goods_display_top10;
复制代码

5）脚本执行时间
企业开发中一般在每天凌晨30分~1点

第19章需求十三：点击次数最多的10个用户点击的商品次数top10
19.1 DWS层
使用日志数据用户行为宽表作为DWS层表
19.2 ADS层
19.2.1 建表语句

drop table if exists ads_goods_user_count;
create external table ads_goods_user_count( 
`dt` string COMMENT '统计日期',
`mid_id` string COMMENT '用户id',
    `u_ct` string COMMENT '用户总点击次数',
    `goodsid` string COMMENT '商品id',
    `d_ct` string COMMENT '各个商品点击次数'
) 
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_goods_user_count';
复制代码

19.2.2 导入数据

insert into table ads_goods_user_count
select
  '2019-10-03',
  mid_id,
  u_ct,
  goodsid,
  d_ct
from(
  select
    mid_id,
    u_ct,
    goodsid,
    d_ct,
    row_number() over(partition by mid_id order by d_ct desc ) rn
  from(
    select
      dl.mid_id,
      u_ct,
      dl.goodsid,
      count(*) d_ct
    from dwd_display_log dl join (
      select
        mid_id,
        count(*) u_ct
      from dws_user_action_wide_log
      group by mid_id
      order by u_ct desc
      limit 10
    )t1
    on dl.mid_id=t1.mid_id
    group by dl.mid_id, u_ct, dl.goodsid
  ) t2
) t3
where rn<=10
复制代码

19.2.3 导入数据脚本
1）在/home/kgg/bin目录下创建脚本ads_goods_user_count.sh

[kgg@hadoop102 bin]$ vim ads_goods_user_count.sh
复制代码

在脚本中填写如下内容

#!/bin/bash
db=gmall
hive=/opt/module/hive/bin/hive
hadoop=/opt/module/hadoop/bin/hadoop

if [[ -n $1 ]]; then
    do_date=$1
else
    do_date=`date -d '-1 day' +%F`
fi

sql="
use gmall;
insert into table ads_goods_user_count
select
  '$do_date',
  mid_id,
  u_ct,
  goodsid,
  d_ct
from(
  select
    mid_id,
    u_ct,
    goodsid,
    d_ct,
    row_number() over(partition by mid_id order by d_ct desc ) rn
  from(
    select
      dl.mid_id,
      u_ct,
      dl.goodsid,
      count(*) d_ct
    from dwd_display_log dl join (
      select
        mid_id,
        count(*) u_ct
      from dws_user_action_wide_log
      group by mid_id
      order by u_ct desc
      limit 10
    )t1
on dl.mid_id=t1.mid_id
group by dl.mid_id, u_ct, dl.goodsid
  ) t2
) t3
where rn<=10
"

$hive -e "$sql"
复制代码

2）增加脚本执行权限

[kgg@hadoop102 bin]$ chmod 777 ads_goods_user_count.sh
复制代码

3）脚本使用

[kgg@hadoop102 module]$ ads_goods_user_count.sh 2019-02-20
复制代码

4）查询结果

hive (gmall)> select * from ads_goods_user_count;
复制代码

5）脚本执行时间
企业开发中一般在每天凌晨30分~1点

第20章需求十四：月活跃率
月活跃用户与截止到该月累计的用户总和之间的比例
20.1 DWS层
使用DWS层月活表以及ADS新增用户表作为DWS层
20.2 ADS层
20.2.1 建表语句

drop table if exists ads_mn_ratio_count;
create external table ads_mn_ratio_count( 
   `dt` string COMMENT '统计日期',
   `mn` string COMMENT '统计月活跃率的月份',
   `ratio` string COMMENT '活跃率'
) 
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_mn_ratio_count';
复制代码

20.2.2 导入数据

insert into table ads_mn_ratio_count
select
    '2019-10-03',
    date_format('2019-10-03','yyyy-MM'),
    mn_count/sum_user*100 mn_percent
from
(select count(*) mn_count from dws_uv_detail_mn where mn=date_format('2019-10-03','yyyy-MM')) t1,
(select sum(new_mid_count) sum_user from ads_new_mid_count) t2;
复制代码

20.2.3 导入数据脚本

1）在/home/kgg/bin目录下创建脚本ads_mn_ratio_count.sh
[kgg@hadoop102 bin]$ vim ads_mn_ratio_count.sh
复制代码

在脚本中填写如下内容

#!/bin/bash

db=gmall
hive=/opt/module/hive/bin/hive
hadoop=/opt/module/hadoop/bin/hadoop

if [[ -n $1 ]]; then
    do_date=$1
else
    do_date=`date -d '-1 day' +%F`
fi

sql="
use gmall;
insert into table ads_mn_ratio_count
select
    '$do_date',
    date_format('$do_date','yyyy-MM'),
    mn_count/sum_user*100 mn_percent
from
(select count(*) mn_count from dws_uv_detail_mn where mn=date_format('$do_date','yyyy-MM')) t1,
(select sum(new_mid_count) sum_user from ads_new_mid_count) t2;
"

$hive -e "$sql"
复制代码

2）增加脚本执行权限

[kgg@hadoop102 bin]$ chmod 777 ads_mn_ratio_count.sh
复制代码

3）脚本使用

[kgg@hadoop102 module]$ ads_mn_ratio_count 2019-02-20
复制代码

4）查询结果

hive (gmall)> select * from ads_mn_ratio_count;
复制代码

5）脚本执行时间
企业开发中一般在每天凌晨30分~1点

最新经典文章，欢迎关注公众号

图文精华

大数据项目之电商数仓（总结）（十四）:用户行为数据仓库

最佳新人

热心会员

推荐 /2