hive
共享单车模拟数据分析
要求如下
库名:hongyaa
表名:bike
字段:
持续时间 duration int,
开始时间 startdate timestamp,(注意时间戳在分析中的细节)
还车时间 enddate timestamp,
开始地编号 startnum int,
开始地名称 startstation string,
还车地编号 endnum int,
还车地名称 endstation string,
自行车编号 bikenum string,
用户类型 type string,(会员/临时会员)
以上面的字段创建一个名为bike的表,要求每一列以“制表符”为分割。
(答案为白色字体,下同)
create table bike (duration int,startdate timestamp,enddate timestamp,startnum int,startstation string,endnum int,endstation string,bikenum string,type string)
row format delimited fields terminated by ',';
检查表的字段:desc bike;(建表是一个得分点,表字段错了后面全做不了)
导入数据:load data local inpath '/root/bike.txt' into table bike;
启用本地计算模式,此举可缩短计算时间,平时可以对比试试看,比赛时建议使用
>set hive.exec.mode.local.auto=true;
数据分析(不分大小写,回车可过行,结束加分号)
1.一共调查了多少辆单车?写入“/root/test001/“用分号隔开 #3191
Insert overwrite local directory ‘/root/test001/’
Row format delimited fields terminated by ‘;’
select count(distinct(bikenum)) from bike;
2. 会员在全部受访者中占比是多少? #0.9
select round((t1.a/t2.b),2) from (select count(*) a from bike where type='Member') t1 join (select count(*) b from bike) t2;
另一种方法:分开两次算,先算会员数量,再算比例
Select count(*) a
From bike;
Select count(*) b
from bike
where type='Member';
Select round((a/b),2); type=’Member’; #38220
Select (38220/count(*),2) from bike; #0.9
3.用车最多的是哪个地区?(以开始地点计算)#Columbus Circle / Union Station
select startstation,count(*) as a
from bike
group by startstation
order by a desc limit 1;
4.骑行时间最长是多少?的是哪辆车(车号)? #84876226 W23232
select duration,bikenum
from bike
order by duration desc limit 1;
5.这些单车平均骑行时间?(单位毫秒) #785938.47 (ms)
select avg(duration)
from bike;
6.这些单车平均骑行时间?(单位分钟) #13.10
select avg(duration/60000) //1分钟等于60000毫秒
from bike;
7.一月一日一共有多少辆车在工作? #1570
select count(distinct bikenum)
from bike
where month(startdate)=1 and day(startdate)=1;
8.正式会员最少的地区是哪个地区? #6035 Warehouse
select startstation,count(*) a
from bike
where type='Member'
group by startstation
order by a desc limit 1;
9.哪一天上班早高峰(6点-9点)用车量最大,并计算该段时间用车频次,用制表符分开。
row format delimited fields terminated by '\t'
select day(startdate) as a ,count(*) as b
from bike
where hour(stratedate) in (6,7,8)
group by a
order by b desc limit 1;
10.找出骑行时间最短的车号;w 20752
select min(duration) from bike;
select bikenum from bike
where duration='60777';
另一种方法:select bikenum from bike where duration in (select min(duration) from bike)
11.找出单车骑行时间大于平均骑行时间的车号及其骑行时间,使用','分隔,将结果保存在'/root/test011'中;
select avg(duration) from bike; 785938
insert overwrite local directory '/root/test011'
row format delimited fields terminated by ','
select bikenum,duration
from bike
where duration>'785938' ;
12.计算出在一月8日工作的车辆数,将结果保存在'/root/test012'中;952
insert overwrite local directory '/root/test012' 1920
row format delimited fields terminated by ','
select count(distinct(bikenum))
from bike
where month =’1’ and day(startdate)='8'
13.找出正式会员最少的地区中临时会员的数量,使用','分隔,将结果保存在'/root/test013'中;
select startstation,count(*) a
from bike
where type='Member' group by startstation
order by a desc limit 1; //6035 Warehouse
insert overwrite local directory '/root/test013'
select count(*) from bike
where type='Casual' and startstation='Warehouse';
14.找出1月9日下班高峰(17点-20点)用车数最多的地区,用','分隔,降序排序; #1st & M St NE,10
select startstation,count(*) a
from bike
where month=’1’ and day(startdate)='9' and hour(startdate) in (17,18,19,20) group by startstation order by a desc limit 1;
Columbus Circle / Union Station 54
可视化
1. 绘出2017年1月1日到1月6日的用车情况(数量)折线图。
标题为:”2017年1月1日到1月6日会员与非会员用车情况“
标签依次为会员,非会员
横坐标为1日...6日
以开始时间为准
select type,count(*) from bike where day(startdate)='1' group by type;
Casual 1580
Member 2484
select type,count(*) from bike where day(startdate)='2' group by type;
Casual 181
Member 1459
select type,count(*) from bike where day(startdate)='3' group by type;
Casual 254
Member 3520
select type,count(*) from bike where day(startdate)='4' group by type;
Casual 748
Member 6438
select type,count(*) from bike where day(startdate)='5' group by type;
Casual 402
Member 5814
select type,count(*) from bike where day(startdate)='6' group by type;
Casual 309
Member 5263
option = {
title:{
text:'2017一月1日到一月6日会员与非会员用车情况'
},
xAxis:{
type:'category',
data:['1日','2日','3日','4日','5日','6日']
},
legend:{
data:['会员','非会员'],
x:'right',
},
yAxis:{
type:'value',
},
series:[
{
name:'会员',
type:'line',
data:[2484,1459,3520,6438,5814,5263]
},
{
name: '非会员',
type: 'line',
data: [1580,181,254,748,402,309]
}]
};
2.各地会员数量前3的地区及数量和全体用户类型的双饼图
Select startstation,count(*) a
From bike
Where type=’Member’
Group by startstation
Order by a desc limit 3;
Select count(*) a
From bike
Where type=’Member’;
Select count(*) b
From bike
Where type=’Casual’;
Select round((a/b),5);
option={
title:{
text:'会员类别及会员数量前五的地区比例图',
x:'center'
},
tooltip:{
trigger:'item',
formatter:"{a}<br/>{b}:{c}({d}%)"
},
legend:{
orient:'vertical',
x:'left',
data:['Columbus Circle / \nUnion Station','15th & P \nSt NW','Thomas \nCircle','New Hampshire Ave &\n T St NW','Massachusetts Ave &\n Dupont Circle NW']
},
series:[
{
name:'用户类别',
type:'pie',
selectdMode:'single',
radius:[0,'30%'],
data:[
{value:4185,name:'临时会员'},
{value:38220,name:'会员',color:'bule'}
]
},
{
name:'地区',
type:'pie',
radius:['40%','55%'],
data:[
{value:1029,name:'Columbus Circle / \nUnion Station'},
{value:541,name:'15th & P \nSt NW'},
{value:535,name:'Thomas \nCircle'},
{value:527,name:'New Hampshire Ave &\n T St NW'},
{value:480,name:'Massachusetts Ave &\n Dupont Circle NW'}
]
}
]
};
持续时间 duration int,
开始时间 startdate timestamp,(注意时间戳在分析中的细节)
还车时间 enddate timestamp,
开始地编号 startnum int,
开始地名称 startstation string,
还车地编号 endnum int,
还车地名称 endstation string,
自行车编号 bikenum string,
用户类型 type string,(会员/临时会员)
3.以地点为横坐标,
绘出各地会员数量前5的地区及数量的柱状图,
title='会员的数量TOP5的地区'
Select startstation,count(*) a
From bike
Group by startstation
Order by a desc limit 5;
Columbus Circle / Union Station 1029
15th & P St NW 541
Thomas Circle 535
New Hampshire Ave & T St NW 527
Massachusetts Ave & Dupont Circle NW 480
option = {
title:{
text:'会员数量前5的地区及数量',
x:'center'
},
color: ['#388599'],
tooltip : {
trigger: 'axis'
},
grid: {
left: '0%',
right: '0%',
bottom: '0%',
containLabel: true
},
xAxis : [
{
text:'地区',
type : 'category',
data : ['Columbus Circle / \nUnion Station', '15th & \nP St NW',
'Thomas\n Circle', 'New Hampshire Ave\n & T St NW',
'Massachusetts Ave &\n Dupont Circle NW'],
axisTick: {
alignWithLabel: true
}
}
],
yAxis : [
{
type : 'value'
}
],
series : [
{
name:'单车使用频次',
type:'bar',
barWidth: '50%',
data:[1029, 541, 535, 527, 480]
}
]
};
2019智警杯模拟数据分析
要求如下
库名:hongya
表名:final
字段:
案件编号 id int,
案件状态 state string,
案件副类别 class string,
损失金额 loss int,
损失程度 degree string,
作案手法 mode string,
案件来源 source string,
案发事件上限uppertime timestamp,
案发事件下限lowertime timestamp,
案发地点 place string,
受理单位 accept_unit string,
受理时间 accept_time timestamp,
报案时间 take_time timestamp,
警员 police string,
破案时间 break_time timestamp
以上面的字段创建一个名为final的表,要求每一列以“制表符”为分割。
(答案为白色字体,下同)
create table final(
id int,state string,class string,loss int,
degree string,mode string,source string,
uppertime timestamp,lowertime timestamp,
place string,accept_unit string,
accept_time timestamp,take_time timestamp,
police string,break_time timestamp)
row format delimited fields terminated by '\t';
检查表的字段:desc final;(建表是一个得分点,表字段错了后面全做不了)
导入数据:load data inpath '目录/xxx.txt' into table final;
启用本地计算模式,此举可缩短计算时间,平时可以对比试试看,比赛时建议使用
>set hive.exec.mode.local.auto=true;
数据分析(不分大小写,回车可过行,结束加分号)
1.统计2017年3月份(以发案时间为准)的经济损失总额,将结果写入到/root/test001/中,要求使用 “制表符”作为声明文件分隔符;
insert owerwrite local directory ‘/root/test001/’
row format delimited fields terminated by ‘\t’
select sum(loss)
from final
where year(take_time) = 2017 and month(take_time) = 3;
2.找出经济损失最多的案件副类别,并给出该案件副类别对应的损失总额,将结果写入到/root/test002/中,要求使用 “制表符”作为声明文件分隔符。
Insert overwrite local directory ‘/root/test002/’
Row format delimited fields terminated by ‘\t’
Select class,sum(loss) total
from final
group by class
order by total desc limit 1;
3.统计2016年03月份发生案件总数,将结果写入到/root/test003/中,要求使用 “制表符”作为声明文件分隔符。
Insert overwrite local directory ’/root/test003/’
Row format delimited fields terminated by’\t’
Select count(*) from final where year(tabe_time)=2016 and mouth(take_time)=3;
4.在损失度为“特别巨大”的案件中,找出发生频次最高的案件副类别并统计其发生频次,将结果写入到/root/test004/中,要求使用 “制表符”作为声明文件分隔符.
Insert overwrite local directory ’/root/test004/’
Row format delimited fields terminated by’\t’
Select class,count(*) total
from final
where degree=’特别巨大’
Group by class
order by total desc limit 1;
5.列出诈骗最高发的地区及其对应的损失金额TOP3,将结果写入到/root/test005/中,要求使用 “制表符”作为声明文件分隔符;
Insert overwrite local directory ’/root/test005/’
Row format delimited fields terminated by’\t’
Select place,sum(loss) total
from final
where class like ‘%诈骗%’
group by place
order by total limit 3;
6.“短信诈骗”的发案时间平均为多久(即发案时间的下限(天)-发案时间的上限(天)),将结果写入到/root/test006/中,要求使用 “制表符”作为声明文件分隔符;
DATEDIFF() 函数返回两个日期之间的时间
语法格式:DATEDIFF(datepart,startdate,enddate)
Insert overwrite local directory ’/root/test006/’
Row format delimited fields terminated by’\t’
Select int(avg(datadiff(day,lowertime,uppertime)))
from final
Where class = ‘短信诈骗’;
7. 列出2016年春季的经济损失总额,将结果写入到/root/test007/中,要求使用 “制表符”作为声明文件分隔符;
Insert overwrite local directory ’/root/test007/’
Row format delimited fields terminated by’\t’
Select sum(loss) from final
where year(take_time)=2016 and month(take_time)=(1,2,3);
8.列出“深夜”时段受理案件最多的派出所及其受理案件数目。
(00-07为深夜,08-12为上午,13-19为下午,20-23为晚上)
格式:对于结果中的二维数据,要求使用 “制表符”作为声明文件
分隔符。
复合排列:按照受理案件数据进行降序排列;
Row format delimited fields terminated by’\t’
Select accept_unit,count(*) total
from final
where hour(accept_time) in (0,1,2,3,4,5,6,7)
Group by accept_unit
Order by total desc limit 1;
9.列出网络诈骗中案发频次最高的作案手法top5及其对应案发次数;
格式:对于结果中的二维数据,要求使用 “制表符”作为声明文件分隔符。
复合排列:按照案发次数进行降序排列;
Select mode,count(*) tatal from final
Where class=’网络诈骗’
Group by mode
Order by total desc limit 5;
;列出2019年破获案件总金额最高的接警员(按姓氏)top10及其破获总金额。
格式:对于结果中的二维数据,要求使用“制表符”作为声明文件分隔符。
复合排列:按照金额数目进行降序排列;
Row format delimited fields terminated by ‘\t’
Select police,sum(loss) total from final
Where state=’破案’ and year(break_time)=2019
Group by police
Order by total desc limit 10;
11.统计在A城案件中的各案件副类别发生频次。
格式:对于结果中的二维数据,要求使用“制表符”作为声明文件分隔符。
复合排列:按照类别频次进行降序排列;
Select class,count(*) total from final
Where take_place like ’A城%’
Group by class
Order by total desc;
12.在损失金额超过18万的案件中,找出受理案件最多的派出所top5及其对应
受理案件数目。
格式:对于结果中的二维数据,要求使用 “制表符”作为声明文件分隔符。
复合排列:先按照受理案件数目进行降序排列,再按照派出所名称升序排列;
Select accept_unit,count(*) cases from final
Where loss > 180000
Group by accept_unit
Order by cases desc,accept_unit limit 5;
13.统计出C城深夜时段发生案件总次数
Select count(*) from final
Where hour(take_time) in (0,1,2,3,4,5,6,7) and place=’C城’;
14.统计出2016年下半年破案数top10的警察及其破案数
Select police,count(*) total from final
Where state=’破案’ and year(break_time)=2016 and month(break_time) in (6,7,8,9,10,11,12)
Group by police
Order by total desc limit 10;
15.统计近年来损失程度较大的犯罪案件,显示年份和数量和经济损失,按年份排序
Select year(take_time) as a,count(*),sum(loss) from final
Where degree=’较大’
Group by a
Order by a;
16.统计所有诈骗手法,次数及金额
Select mode,count(*),sum(loss)
from final
Group by mode;
T17.统计2018年所有城市的破案率,保留两位小数 【17,18,19,20有更好的方法(但是我不会。。。)】
①Select count(*) from final where year(take_time)=2018 and year(break_time)=2018;
②Select count(*) from final
where year(take_time)=2018 and year(break_time)=2018 and state=’破案’;
③select round(①/②,2);
18.2019年发生最多的案件副类别的破案率?#网络诈骗 0.77
select class,count(*) as total from final where year(take_time)=2019 group by class order by total desc limit 1; #网络诈骗 244
select count(*)/244 from final where year(take_time)=2019 and year(break_time)=2019 and class='网络诈骗';
19.比网络诈骗破案率高的案件副类别有什么? #电话诈骗,短信诈骗
select class,count(*) as total from final where state='破案' group by class order by total desc;
select class,count(*) as total from final group by class order by total desc;
select (1258/1688); #0.75
select (163/208); #0.78
select (83/104); #0.80
20.各民警在2017年8月的电信诈骗破案率?
select count(*) a from final where year(take_time)=2017 and month(take_time)=8 and year(break_time)=2017 and month(break_time)=8 and class=’电信诈骗’;
select police,count(*) b from final where year(take_time)=2017 and month(take_time)=8 and year(break_time)=2017 and month(break_time)=8 and police like '%警官' and class=’电信诈骗’ group by police order by b;
Select (a/b)
可视化
1.绘出所有案件的损失程度比例图;(饼图)
要求:1.图形类别为饼状图;
title=‘案件损失程度比例图’;
标签legend依次为:较大,巨大,特别巨大
series.name=‘损失程度’;
insert overwrite local directory '/root/test101/'
row format delimited fields terminated by '\t'
select count(*) from final;
select round(count(*)/2000,2)*100 from final where degree='较大';#70
select round(count(*)/2000,2)*100 from final where degree='巨大';#22
select round(count(*)/2000,2)*100 from final where degree='特别巨大';#8
option = {
title: {
text: '案件损失程度比例图',
x: 'center' //位置居中
},
tooltip: {
trigger: 'item', //触发键,把鼠标放在图形上面触发
formatter:"{a}<br/>{b}:{c}({d}%)"
},
legend: {
orient: 'vertical', //垂直放图例
x: 'left', //图例放在左边
data:['较大','巨大','特别巨大']
},
series: [
{
name: '损失程度',
type: 'pie',
selectdMode:'single',
data: [
{value: 70, name: '较大'},
{value: 22, name: '巨大'},
{value: 8, name: '特别巨大'}
],
}
]}
2.绘出所有案件的作案手法比例图;(双饼图)
要求:
title=’ 案件副类别及作案手法比例图’;
legend标签依次为'冒充熟人','网络贷款','购物钓鱼网站
','网络购物','银行卡案件','虚拟物品','涉案转账','网络预
测彩票';
series.name=‘作案手法’;
图形类型必须为饼状图;
insert overwrite local directory '/root/test102/'
row format delimited fields terminated by '\t'
select mode,count(*) from final
group by mode;
option = {
title: {
text: '某站点用户访问来源',
x: 'center' //位置居中
},
tooltip: {
trigger: 'item', //触发键,把鼠标放在图形上面触发
formatter:"{a}<br/>{b}:{c}({d}%)"
},
legend: {
orient: 'vertical', //垂直放图例
x: 'left', //图例放在左边
data:['1','2']
},
series: [
{
name: '名称',
type: 'pie',
selectdMode:'single',
radius: '30%',
data: [
{value: 1048, name: '搜索引擎'},
{value: 735, name: '直接访问'},
{value: 580, name: '邮件营销'},
{value: 484, name: '联盟广告'},
{value: 300, name: '视频广告'}
],
}
{
name:'地区',
type:'pie',
radius:['40%','55%'],
data:[
{value: 1048, name: '搜索引擎'},
{value: 735, name: '直接访问'},
{value: 580, name: '邮件营销'},
{value: 484, name: '联盟广告'},
{value: 300, name: '视频广告'}
]
}
]
};
3.绘出2015-2019年每年的案件副类别发生频次随年份的变化图;(折线图)
要求:1.图形类别为折线图;
title=‘2015年至2019年各案件副类别频次图’;
标签依次为电话诈骗,短信诈骗,网络诈骗;
横坐标时间顺序依次为2015年,2016年,2017年,2018年,2019年;
select class,count(*) from final where year(take_time)='2015'
group by class ;
电话诈骗 58
短信诈骗 29
网络诈骗 368
select class,count(*) from final where year(take_time)='2016'
group by class ;
电话诈骗 53
短信诈骗 23
网络诈骗 328
select class,count(*) from final where year(take_time)='2017'
group by class ;
电话诈骗 39
短信诈骗 19
网络诈骗 367
select class,count(*) from final where year(take_time)='2018'
group by class ;
电话诈骗 30
短信诈骗 24
网络诈骗 381
select class,count(*) from final where year(take_time)='2019'
group by class ;
电话诈骗 28
短信诈骗 9
网络诈骗 244
option = {
title : {
text: '2015年至2019年各案件副类别频次图',
x:'center'
},
legend: {
data:['电话诈骗','网络诈骗','短信诈骗'], //标签
x:'right',
orient: 'vertical', //垂直放图例
},
xAxis : [
{
type : 'category', //类目
data : ['2015年','2016年','2017年','2018年','2019年']
}
],
yAxis : [
{
type : 'value', //数值轴
}
],
series : [
{
name:'电话诈骗',
type:'line', //图形类型—折线图
data:[58,53,38,30,28]
},
{
name:'短信诈骗',
type:'line', //图形类型—折线图
data:[29,23,19,24,9]
},
{
name:'网络诈骗',
type:'line', //图形类型—折线图
data:[368,328,367,381,244]
},
]
};
4.2016年与2018年各案件副类别破案率对比图(柱状图)
要求:1.图形类别为柱状图;
title=‘2016年与2018年各案件副类别破案率对比图’;
标签依次为'2016年','2018年';
y轴为类目轴,其坐标时间顺序依次为电话诈骗破案率,短信诈骗破案率,网络诈骗破案率;
round精度为小数点后2位;