本帖最后由 林宝宝 于 2019-9-23 22:19 编辑
问题导读:
1.本次统计硅谷影音视频网站的哪些常规指标?
2.如何进行数据清洗以及清洗目标是什么?
3.实现统计的具体代码是什么?
Hive实战之影音系统
数据(https://download.csdn.net/download/kevin__durant/11798895)这上面还要积分
直接百度网盘下载:
链接: https://pan.baidu.com/s/1y9B93W2v2maLpM863qyItg 提取码: x4z5
数据格式↓前九行分别对应表名,最后对应相关视频
LKh7zAJ4nwo TheReceptionist 653 Entertainment 424 13021 4.34 1305 744 DjdA-5oKYFQ NxTDlnOuybo c-8VuICzXtU DH56yrIO5nI W1Uo5DQTtzc E-3zXq_r4w0 1TCeoRPg5dE yAr26YhuYNY 2ZgXx72XmoE -7ClGo-YgZ0 vmdPOOd6cxI KRHfMQqSHpk pIMpORZthYw 1tUDzOp10pk heqocRij5P0 _XIuvoH6rUg LGVU5DsezE0 uO2kj6_D8B4 xiDqywcDQRM uX81lMev6_o
hive的一些基础经典习题↑
需求描述
统计硅谷影音视频网站的常规指标,各种TopN指标:
–统计视频观看数Top10
–统计视频类别热度Top10
–统计视频观看数Top20所属类别
–统计视频观看数Top50所关联视频的所属类别Rank
–统计每个类别中的视频热度Top10
–统计每个类别中视频流量Top10
–统计上传视频最多的用户Top10以及他们上传的视频
–统计每个类别视频观看数Top10
项目
数据结构
1.视频表
2.用户表
2 ETL原始数据
​ 通过观察原始数据形式,可以发现,视频可以有多个所属分类,每个所属分类用&符号分割,且分割的两边有空格字符,同时相关视频也是可以有多个元素,多个相关视频又用“\t”进行分割。为了分析数据时方便对存在多个子元素的数据进行操作,我们首先进行数据重组清洗操作。即:将所有的类别用“&”分割,同时去掉两边空格,多个相关视频id也使用“&”进行分割。
原始数据在目录下,此处使用MapReduce清洗数据
数据清洗
清洗工具类
[mw_shl_code=java,true]public String dataRinse(String str){
String[] split = str.split("\t");
//过滤没有视频的用户
if (split.length<9){
return "";
}
//将用户中的空格替换掉
split[3] = split[3].replaceAll(" ","");
//将后面的视频的数据合并为&分割的数据
StringBuilder stringBuilder = new StringBuilder();
for (int i = 0; i < split.length; i++) {
if (i<9){
if (i==split.length-1){
stringBuilder.append(split);
}else {
stringBuilder.append(split).append("\t");
}
}else {
if (i==split.length-1){
stringBuilder.append(split);
}else {
stringBuilder.append(split).append("&");
}
}
}
return stringBuilder.toString();
}
public static void main(String[] args) {
String s = new ETLUtil().dataRinse("uFoWXi25RBk");
System.out.println(s);
}
[/mw_shl_code]
清洗Mapper端
[mw_shl_code=java,true]public class ETLMapper extends Mapper<LongWritable , Text , Text , NullWritable> {
Text k = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//获得数据
String line = value.toString();
//清洗
String s = new ETLUtil().dataRinse(line);
k.set(s);
//输出
context.write(k , NullWritable.get());
}
}[/mw_shl_code]
清洗驱动Driver
[mw_shl_code=java,true]public class ETLDriver implements Tool {
private Configuration configuration;
public int run(String[] args) throws Exception {
Job job = Job.getInstance(getConf());
job.setJarByClass(ETLDriver.class);
job.setMapperClass(ETLMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setNumReduceTasks(0);
FileInputFormat.setInputPaths(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
boolean b = job.waitForCompletion(true);
return b ? 0 : 1;
}
public void setConf(Configuration conf) {
this.configuration = conf;
}
public Configuration getConf() {
return configuration;
}
public static void main(String[] args) throws Exception {
int run = ToolRunner.run(new ETLDriver(), args);
System.out.println(run);
}
}[/mw_shl_code]
将vedio执行清洗
hive数据分析
建表
创建表:gulivideo_ori gulivideo_user_ori
[mw_shl_code=sql,true]create table gulivideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;[/mw_shl_code]
[mw_shl_code=sql,true]create table gulivideo_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as textfile;[/mw_shl_code]
导入数据
将video数据导入前表,将user导入后表直接load data 即可
业务分析
统计视频观看数Top10
直接单表排序分页
[mw_shl_code=sql,true]select uploader,views
from gulivideo_ori
order by views desc
limit 10;[/mw_shl_code]
统计视频类别热度Top10
[mw_shl_code=sql,true]select 3.取出前十
t3.cate,t3.cou_cate
from
(
select 2.统计没类的热度
t2.cate cate , count(*) cou_cate
from
(
select t1.ca cate 1.将类别炸开
from gulivideo_ori lateral view explode(category) t1 as ca
)t2
group by t2.cate
)t3
order by t3.cou_cate
limit 10[/mw_shl_code]
统计视频观看数Top20所属类别
[mw_shl_code=sql,true]select 3.对相同类别去重
distinct(cate)
from
(
select 2.取出前二十观看数,类别,和视频id
cate,views,videoid
from
(
select t1.ca cate,videoid,views 1.将类别炸开
from gulivideo_ori lateral view explode(category) t1 as ca
)t2
order by views desc
limit 20
)t3[/mw_shl_code]
统计视频观看数Top50所关联视频的所属类别Rank
[mw_shl_code=sql,true]
select 5.排序rank
*
from
(
select 4.将合并的表的类别字段炸开,对组进行分组,统计count
t4.category , count(*) hot
from
(
select 3.然后与原表再连接join
*
from
(
select 2.因为关联视频字段是数组,将其炸开并对相关视频字段去重
distinct(relatedids_name)
from
(
select * 1.取出观看次数top50
from gulivideo_ori
order by views desc
limit 50
)t1
lateral view explode(t1.relatedid) relatedids_t as relatedids_name
)t2
join gulivideo_ori t3
where t2.relatedids_name=t3.videoid
)t4
lateral view explode(t4.category) category_t as category_name
group by t4.category
)t5
order by t5.hot desc[/mw_shl_code]
下面几个业务都遇到了需要将类别炸开的形式,那就先将类别炸开的表先导入临时表
[mw_shl_code=sql,true] create table gulivideo_category(
videoId string,
uploader string,
age int,
categoryId string,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated [/mw_shl_code]
[mw_shl_code=sql,true]
insert into table gulivideo_category
select
videoId,
uploader,
age,
categoryId,
length,
views,
rate,
ratings,
comments,
relatedId
from
gulivideo_orc lateral view explode(category) catetory as categoryId;[/mw_shl_code]
统计每个类别中的视频热度Top10
[mw_shl_code=sql,true]select 2.取出top10
t1.categoryId,
t1.views,
from
(
select 1.按类别分组,观看次数排序
categoryId,
views,
row_number() over(partition by categoryId order by views desc) rank
from gulivideo_category
)t1
where rank <= 10[/mw_shl_code]
统计每个类别中视频流量Top10
[mw_shl_code=sql,true]
select
t1.categoryId,
t1.ratings
from
(
select
categoryId,
ratings,
row_number() over(partition by categoryId order by ratings desc) rank
from gulivideo_category
)t1
where rank <= 10[/mw_shl_code]
统计上传视频最多的用户Top10以及他们上传的观看次数在前20的视频
[mw_shl_code=sql,true]
select
t2.uploader,
t2.views
from
(
select
*
from gulivideo_user_ori
order by videos desc
limit 20
)t1
join
(
select
*
from gulivideo_ori
)t2
where t1.uploader=t2.uploader
order by views desc
limit 20[/mw_shl_code]
统计每个类别视频观看数Top10
[mw_shl_code=sql,true]
select
t1.categoryId,
t1.views,
from
(
select
categoryId,
views,
row_number() over(partition by categoryId order by views desc) rank
from gulivideo_category
)t1
where rank <= 10[/mw_shl_code]
————————————————
版权声明:本文为CSDN博主「Kevin__Durant」的原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/Kevin__Durant/article/details/101115361
更多精彩文章,欢迎关注公众号
|
|