HIVE项目实战

本帖最后由林宝宝于 2019-9-23 22:19 编辑

问题导读：
1.本次统计硅谷影音视频网站的哪些常规指标？
2.如何进行数据清洗以及清洗目标是什么？
3.实现统计的具体代码是什么？

Hive实战之影音系统

数据（https://download.csdn.net/download/kevin__durant/11798895）这上面还要积分
直接百度网盘下载：
链接: https://pan.baidu.com/s/1y9B93W2v2maLpM863qyItg 提取码: x4z5

数据格式↓前九行分别对应表名，最后对应相关视频

LKh7zAJ4nwo TheReceptionist 653 Entertainment 424 13021 4.34 1305 744 DjdA-5oKYFQ NxTDlnOuybo c-8VuICzXtU DH56yrIO5nI W1Uo5DQTtzc E-3zXq_r4w0 1TCeoRPg5dE yAr26YhuYNY 2ZgXx72XmoE -7ClGo-YgZ0 vmdPOOd6cxI KRHfMQqSHpk pIMpORZthYw 1tUDzOp10pk heqocRij5P0 _XIuvoH6rUg LGVU5DsezE0 uO2kj6_D8B4 xiDqywcDQRM uX81lMev6_o

hive的一些基础经典习题↑

需求描述

统计硅谷影音视频网站的常规指标，各种TopN指标：

–统计视频观看数Top10

–统计视频类别热度Top10

–统计视频观看数Top20所属类别

–统计视频观看数Top50所关联视频的所属类别Rank

–统计每个类别中的视频热度Top10

–统计每个类别中视频流量Top10

–统计上传视频最多的用户Top10以及他们上传的视频

–统计每个类别视频观看数Top10

项目
数据结构

1．视频表

微信截图_20190923175337.png

2．用户表

2 ETL原始数据

 通过观察原始数据形式，可以发现，视频可以有多个所属分类，每个所属分类用&符号分割，且分割的两边有空格字符，同时相关视频也是可以有多个元素，多个相关视频又用“\t”进行分割。为了分析数据时方便对存在多个子元素的数据进行操作，我们首先进行数据重组清洗操作。即：将所有的类别用“&”分割，同时去掉两边空格，多个相关视频id也使用“&”进行分割。

原始数据在目录下，此处使用MapReduce清洗数据

数据清洗
清洗工具类

[mw_shl_code=java,true]public String dataRinse(String str){
      String[] split = str.split("\t");
      //过滤没有视频的用户
      if (split.length<9){
         return "";
      }
      //将用户中的空格替换掉
      split[3] = split[3].replaceAll(" ","");
      //将后面的视频的数据合并为&分割的数据
      StringBuilder stringBuilder = new StringBuilder();
      for (int i = 0; i < split.length; i++) {
         if (i<9){
            if (i==split.length-1){
                  stringBuilder.append(split);
            }else {
                  stringBuilder.append(split).append("\t");
            }
         }else {
            if (i==split.length-1){
                  stringBuilder.append(split);
            }else {
                  stringBuilder.append(split).append("&");
            }
         }

      }
      return stringBuilder.toString();
}

public static void main(String[] args) {
      String s = new ETLUtil().dataRinse("uFoWXi25RBk");
      System.out.println(s);
}
[/mw_shl_code]

清洗Mapper端

[mw_shl_code=java,true]public class ETLMapper extends Mapper<LongWritable , Text , Text , NullWritable> {
Text k = new Text();

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
      //获得数据
      String line = value.toString();

      //清洗
      String s = new ETLUtil().dataRinse(line);
      k.set(s);

      //输出
      context.write(k , NullWritable.get());
}
}[/mw_shl_code]

清洗驱动Driver

[mw_shl_code=java,true]public class ETLDriver implements Tool {
private Configuration configuration;
public int run(String[] args) throws Exception {
      Job job = Job.getInstance(getConf());

      job.setJarByClass(ETLDriver.class);

      job.setMapperClass(ETLMapper.class);

      job.setMapOutputKeyClass(Text.class);
      job.setMapOutputValueClass(NullWritable.class);

      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(NullWritable.class);
      job.setNumReduceTasks(0);

      FileInputFormat.setInputPaths(job,new Path(args[0]));
      FileOutputFormat.setOutputPath(job,new Path(args[1]));

      boolean b = job.waitForCompletion(true);
      return b ? 0 : 1;
}

public void setConf(Configuration conf) {
      this.configuration = conf;
}

public Configuration getConf() {
      return configuration;
}

public static void main(String[] args) throws Exception {
      int run = ToolRunner.run(new ETLDriver(), args);
      System.out.println(run);
}
}[/mw_shl_code]

将vedio执行清洗

hive数据分析
建表

创建表：gulivideo_ori gulivideo_user_ori

[mw_shl_code=sql,true]create table gulivideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;[/mw_shl_code]

[mw_shl_code=sql,true]create table gulivideo_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as textfile;[/mw_shl_code]

导入数据

将video数据导入前表，将user导入后表直接load data 即可

业务分析
统计视频观看数Top10

直接单表排序分页

[mw_shl_code=sql,true]select uploader,views
from gulivideo_ori
order by views desc
limit 10;[/mw_shl_code]

统计视频类别热度Top10

[mw_shl_code=sql,true]select 3.取出前十
t3.cate,t3.cou_cate
from
(
select 2.统计没类的热度
t2.cate cate , count(*) cou_cate
from
(
select t1.ca cate 1.将类别炸开
from gulivideo_ori lateral view explode(category) t1 as ca
)t2
group by t2.cate
)t3
order by t3.cou_cate
limit 10[/mw_shl_code]

统计视频观看数Top20所属类别

[mw_shl_code=sql,true]select 3.对相同类别去重
distinct(cate)
from
(
select 2.取出前二十观看数，类别，和视频id
cate,views,videoid
from
(
select t1.ca cate,videoid,views 1.将类别炸开
from gulivideo_ori lateral view explode(category) t1 as ca
)t2
order by views desc
limit 20
)t3[/mw_shl_code]

统计视频观看数Top50所关联视频的所属类别Rank
[mw_shl_code=sql,true]
select    5.排序rank
*
from
(
select    4.将合并的表的类别字段炸开，对组进行分组，统计count
t4.category , count(*) hot
from
(
select 3.然后与原表再连接join
*
from
(
select 2.因为关联视频字段是数组，将其炸开并对相关视频字段去重
distinct(relatedids_name)
from
(
select * 1.取出观看次数top50
from gulivideo_ori
order by views desc
limit 50
)t1
lateral view explode(t1.relatedid) relatedids_t as relatedids_name
)t2
join gulivideo_ori t3
where t2.relatedids_name=t3.videoid
)t4
lateral view explode(t4.category) category_t as category_name
group by t4.category
)t5
order by t5.hot desc[/mw_shl_code]

下面几个业务都遇到了需要将类别炸开的形式，那就先将类别炸开的表先导入临时表

[mw_shl_code=sql,true] create table gulivideo_category(
      videoId string,
      uploader string,
      age int,
      categoryId string,
      length int,
      views int,
      rate float,
      ratings int,
      comments int,
      relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated [/mw_shl_code]
[mw_shl_code=sql,true]
insert into table gulivideo_category
      select
         videoId,
         uploader,
         age,
         categoryId,
         length,
         views,
         rate,
         ratings,
         comments,
         relatedId
      from
         gulivideo_orc lateral view explode(category) catetory as categoryId;[/mw_shl_code]

统计每个类别中的视频热度Top10

[mw_shl_code=sql,true]select    2.取出top10
t1.categoryId,
t1.views,
from
(
select    1.按类别分组，观看次数排序
categoryId,
views,
row_number() over(partition by categoryId order by views desc) rank
from gulivideo_category
)t1
where rank <= 10[/mw_shl_code]

统计每个类别中视频流量Top10
[mw_shl_code=sql,true]
select
t1.categoryId,
t1.ratings
from
(
select
categoryId,
ratings,
row_number() over(partition by categoryId order by ratings desc) rank
from gulivideo_category
)t1
where rank <= 10[/mw_shl_code]

统计上传视频最多的用户Top10以及他们上传的观看次数在前20的视频
[mw_shl_code=sql,true]
select
t2.uploader,
t2.views
from
(
select
*
from gulivideo_user_ori
order by videos desc
limit 20
)t1
join
(
select
*
from gulivideo_ori
)t2
where t1.uploader=t2.uploader
order by views desc
limit 20[/mw_shl_code]

统计每个类别视频观看数Top10
[mw_shl_code=sql,true]
select
t1.categoryId,
t1.views,
from
(
select
categoryId,
views,
row_number() over(partition by categoryId order by views desc) rank
from gulivideo_category
)t1
where rank <= 10[/mw_shl_code]

————————————————
版权声明：本文为CSDN博主「Kevin__Durant」的原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接及本声明。
原文链接：https://blog.csdn.net/Kevin__Durant/article/details/101115361

更多精彩文章，欢迎关注公众号