欢迎光临散文网 会员登陆 & 注册

hive

2023-06-09 22:17 作者:alpha-H111  | 我要投稿

共享单车模拟数据分析

要求如下

库名:hongyaa

表名:bike

 

字段:

持续时间 duration int,

开始时间 startdate timestamp,(注意时间戳在分析中的细节)

还车时间 enddate timestamp,

开始地编号 startnum int,

开始地名称 startstation string,

还车地编号 endnum int,

还车地名称 endstation string,

自行车编号 bikenum string,

用户类型 type string,(会员/临时会员)

 

 

以上面的字段创建一个名为bike的表,要求每一列以“制表符”为分割。

(答案为白色字体,下同)

create table bike (duration int,startdate timestamp,enddate timestamp,startnum int,startstation string,endnum int,endstation string,bikenum string,type string)

row format delimited fields terminated by ',';

 

检查表的字段:desc bike;(建表是一个得分点,表字段错了后面全做不了)

导入数据:load data local inpath '/root/bike.txt' into table bike;

 

启用本地计算模式,此举可缩短计算时间,平时可以对比试试看,比赛时建议使用

>set hive.exec.mode.local.auto=true;

 


数据分析(不分大小写,回车可过行,结束加分号)

1.一共调查了多少辆单车?写入“/root/test001/“用分号隔开   #3191

Insert overwrite local directory ‘/root/test001/’

Row format delimited fields terminated by ‘;’

select count(distinct(bikenum)) from bike;

 

 

2. 会员在全部受访者中占比是多少? #0.9

select round((t1.a/t2.b),2) from (select count(*) a from bike where type='Member') t1 join (select count(*) b from bike) t2;

 

另一种方法:分开两次算,先算会员数量,再算比例

Select count(*) a

From bike;

 

Select count(*) b 

from bike

where type='Member';

 

Select round((a/b),2); type=’Member’; #38220

Select (38220/count(*),2) from bike; #0.9

3.用车最多的是哪个地区?(以开始地点计算)#Columbus Circle / Union Station

select startstation,count(*) as a

from bike

group by startstation

order by a desc limit 1;

 

4.骑行时间最长是多少?的是哪辆车(车号)? #84876226 W23232

select duration,bikenum

from bike

order by duration desc limit 1;

 

5.这些单车平均骑行时间?(单位毫秒) #785938.47 (ms)

select avg(duration)

from bike;

 

6.这些单车平均骑行时间?(单位分钟) #13.10

select avg(duration/60000) //1分钟等于60000毫秒

from bike;

 

7.一月一日一共有多少辆车在工作? #1570

select count(distinct bikenum)

from bike

where month(startdate)=1 and day(startdate)=1;

 

8.正式会员最少的地区是哪个地区? #6035 Warehouse

select startstation,count(*) a

from bike

where type='Member'

group by startstation

order by a desc limit 1;

 

9.哪一天上班早高峰(6点-9点)用车量最大,并计算该段时间用车频次,用制表符分开。

row format delimited fields terminated by '\t'

select day(startdate) as a ,count(*) as b

from bike

where hour(stratedate) in (6,7,8)

group by a

order by b desc limit 1;

 

10.找出骑行时间最短的车号;w 20752

select min(duration) from bike;

select bikenum from bike

where duration='60777';

 

另一种方法:select bikenum from bike where duration in (select min(duration) from bike)

11.找出单车骑行时间大于平均骑行时间的车号及其骑行时间,使用','分隔,将结果保存在'/root/test011'中;

select avg(duration) from bike; 785938

 

insert overwrite local directory '/root/test011'

row format delimited fields terminated by ','

select bikenum,duration

from bike

where duration>'785938' ;

 

12.计算出在一月8日工作的车辆数,将结果保存在'/root/test012'中;952

insert overwrite local directory '/root/test012'                   1920

row format delimited fields terminated by ','

select count(distinct(bikenum))

from bike

where month =’1’ and day(startdate)='8'

 

 

13.找出正式会员最少的地区中临时会员的数量,使用','分隔,将结果保存在'/root/test013'中;

select startstation,count(*) a

from bike

where type='Member' group by startstation  

order by a desc limit 1; //6035 Warehouse

 

insert overwrite local directory '/root/test013'

select count(*) from bike

where type='Casual' and startstation='Warehouse';

 

 

14.找出1月9日下班高峰(17点-20点)用车数最多的地区,用','分隔,降序排序; #1st & M St NE,10

select startstation,count(*) a

from bike

where month=’1’ and day(startdate)='9' and hour(startdate) in (17,18,19,20) group by startstation order by a desc limit 1;   

 

Columbus Circle / Union Station 54


可视化

1. 绘出2017年1月1日到1月6日的用车情况(数量)折线图。

标题为:”2017年1月1日到1月6日会员与非会员用车情况“

标签依次为会员,非会员

横坐标为1日...6日

以开始时间为准

 

select type,count(*) from bike where  day(startdate)='1' group by type;

Casual 1580

Member 2484

select type,count(*) from bike where  day(startdate)='2' group by type;

Casual 181

Member 1459

select type,count(*) from bike where  day(startdate)='3' group by type;

Casual 254

Member 3520

select type,count(*) from bike where  day(startdate)='4' group by type;

Casual 748

Member 6438

select type,count(*) from bike where  day(startdate)='5' group by type;

Casual 402

Member 5814

select type,count(*) from bike where  day(startdate)='6' group by type;

Casual 309

Member 5263

 

       option = {

            title:{

                text:'2017一月1日到一月6日会员与非会员用车情况'

                },

            xAxis:{

                type:'category',

                data:['1日','2日','3日','4日','5日','6日']

                },

            legend:{

                data:['会员','非会员'],

                x:'right',

                },

            yAxis:{

                type:'value',

                },

            series:[

                {

                    name:'会员',

                    type:'line',

                    data:[2484,1459,3520,6438,5814,5263]

                },

                {

                    name: '非会员',

                    type: 'line',

                    data: [1580,181,254,748,402,309]

                }]

        };

 

 

 

 

2.各地会员数量前3的地区及数量和全体用户类型的双饼图

Select startstation,count(*) a

From bike

Where type=’Member’

Group by startstation

Order by a desc limit 3;

 

Select count(*) a

From bike

Where type=’Member’;

 

Select count(*) b

From bike

Where type=’Casual’;

 

Select round((a/b),5);

 

option={

    title:{

        text:'会员类别及会员数量前五的地区比例图',

        x:'center'

    },

    tooltip:{

        trigger:'item',

        formatter:"{a}<br/>{b}:{c}({d}%)"

    },

    legend:{

        orient:'vertical',

         x:'left',

        data:['Columbus Circle / \nUnion Station','15th & P \nSt NW','Thomas \nCircle','New Hampshire Ave &\n T St NW','Massachusetts Ave &\n Dupont Circle NW']

    },

    series:[

        {

            name:'用户类别',

            type:'pie',

            selectdMode:'single',

            radius:[0,'30%'],

            data:[

                {value:4185,name:'临时会员'},

               {value:38220,name:'会员',color:'bule'}

            ]

        },

        {

            name:'地区',

            type:'pie',

            radius:['40%','55%'],

            data:[

                {value:1029,name:'Columbus Circle / \nUnion Station'},

               {value:541,name:'15th & P \nSt NW'},

                {value:535,name:'Thomas \nCircle'},

               {value:527,name:'New Hampshire Ave &\n T St NW'},

                {value:480,name:'Massachusetts Ave &\n Dupont Circle NW'}

                ]

        }

    ]

};

 

持续时间 duration int,

开始时间 startdate timestamp,(注意时间戳在分析中的细节)

还车时间 enddate timestamp,

开始地编号 startnum int,

开始地名称 startstation string,

还车地编号 endnum int,

还车地名称 endstation string,

自行车编号 bikenum string,

用户类型 type string,(会员/临时会员)

 

3.以地点为横坐标,

绘出各地会员数量前5的地区及数量的柱状图,

title='会员的数量TOP5的地区'

 

 

Select startstation,count(*) a

From bike

Group by startstation

Order by a desc limit 5;

 

Columbus Circle / Union Station 1029

15th & P St NW 541

Thomas Circle 535

New Hampshire Ave & T St NW 527

Massachusetts Ave & Dupont Circle NW 480

 

        option = {

            title:{

                text:'会员数量前5的地区及数量',

                x:'center'

            },

            color: ['#388599'],

            tooltip : {

                trigger: 'axis'

            },

            grid: {

                left: '0%',

                right: '0%',

                bottom: '0%',

                containLabel: true

            },

            xAxis : [

                {

                    text:'地区',

                    type : 'category',

                    data : ['Columbus Circle / \nUnion Station', '15th & \nP St NW',

                    'Thomas\n Circle', 'New Hampshire Ave\n & T St NW',

                    'Massachusetts Ave &\n Dupont Circle NW'],

                    axisTick: {

                        alignWithLabel: true

                    }

                }

            ],

            yAxis : [

                {

                    type : 'value'

                }

            ],

            series : [

                {

                    name:'单车使用频次',

                    type:'bar',

                    barWidth: '50%',

                    data:[1029, 541, 535, 527, 480]

                }

            ]

        };

 

 

2019智警杯模拟数据分析

要求如下

库名:hongya

表名:final

 

字段:

案件编号 id int,

案件状态 state string,

案件副类别 class string,

损失金额 loss int,

损失程度 degree string,

作案手法 mode string,

案件来源 source string,

案发事件上限uppertime timestamp,

案发事件下限lowertime timestamp,

案发地点 place string,

受理单位 accept_unit string,

受理时间 accept_time timestamp,

报案时间 take_time timestamp,

警员  police string,

破案时间 break_time timestamp

 

 

以上面的字段创建一个名为final的表,要求每一列以“制表符”为分割。

(答案为白色字体,下同)

create table final(

id int,state string,class string,loss int,

degree string,mode string,source string,

uppertime timestamp,lowertime timestamp,

place string,accept_unit string,

accept_time timestamp,take_time timestamp,

police string,break_time timestamp)

row format delimited fields terminated by '\t';

 

检查表的字段:desc final(建表是一个得分点,表字段错了后面全做不了)

导入数据:load data inpath '目录/xxx.txt' into table final;

 

启用本地计算模式,此举可缩短计算时间,平时可以对比试试看,比赛时建议使用

>set hive.exec.mode.local.auto=true;

数据分析(不分大小写,回车可过行,结束加分号)

1.统计2017年3月份(以发案时间为准)的经济损失总额,将结果写入到/root/test001/中,要求使用 “制表符”作为声明文件分隔符;

insert owerwrite local directory ‘/root/test001/’

row format delimited fields terminated by ‘\t’

select sum(loss)

from final

where year(take_time) = 2017 and month(take_time) = 3;

 

 

2.找出经济损失最多的案件副类别,并给出该案件副类别对应的损失总额,将结果写入到/root/test002/中,要求使用 “制表符”作为声明文件分隔符。

Insert overwrite local directory ‘/root/test002/’

Row format delimited fields terminated by ‘\t’

Select class,sum(loss) total

from final

group by class

order by total desc limit 1;

 

 

3.统计2016年03月份发生案件总数,将结果写入到/root/test003/中,要求使用 “制表符”作为声明文件分隔符。

Insert overwrite local directory ’/root/test003/’

Row format delimited fields terminated by’\t’

Select count(*) from final where year(tabe_time)=2016 and mouth(take_time)=3;

 

 

4.在损失度为“特别巨大”的案件中,找出发生频次最高的案件副类别并统计其发生频次,将结果写入到/root/test004/中,要求使用 “制表符”作为声明文件分隔符.

Insert overwrite local directory ’/root/test004/’

Row format delimited fields terminated by’\t’

Select class,count(*) total

from final

where degree=’特别巨大’

Group by class

order by total desc limit 1;

 

 

5.列出诈骗最高发的地区及其对应的损失金额TOP3,将结果写入到/root/test005/中,要求使用 “制表符”作为声明文件分隔符;

Insert overwrite local directory ’/root/test005/’

Row format delimited fields terminated by’\t’

Select place,sum(loss) total

from final

where class like ‘%诈骗%’

group by place

order by total limit 3;

 

 

6.“短信诈骗”的发案时间平均为多久(即发案时间的下限(天)-发案时间的上限(天)),将结果写入到/root/test006/中,要求使用 “制表符”作为声明文件分隔符;

DATEDIFF() 函数返回两个日期之间的时间

语法格式:DATEDIFF(datepart,startdate,enddate)

Insert overwrite local directory ’/root/test006/’

Row format delimited fields terminated by’\t’

Select int(avg(datadiff(day,lowertime,uppertime)))

from final

Where class = ‘短信诈骗’;

 

 

7. 列出2016年春季的经济损失总额,将结果写入到/root/test007/中,要求使用 “制表符”作为声明文件分隔符;

Insert overwrite local directory ’/root/test007/’

Row format delimited fields terminated by’\t’

Select sum(loss) from final

where year(take_time)=2016 and month(take_time)=(1,2,3);

 

8.列出“深夜”时段受理案件最多的派出所及其受理案件数目。

(00-07为深夜,08-12为上午,13-19为下午,20-23为晚上)

格式:对于结果中的二维数据,要求使用 “制表符”作为声明文件

分隔符。

复合排列:按照受理案件数据进行降序排列;

Row format delimited fields terminated by’\t’

Select accept_unit,count(*) total

from final

where hour(accept_time) in (0,1,2,3,4,5,6,7)

Group by accept_unit

Order by total desc limit 1;

 

9.列出网络诈骗中案发频次最高的作案手法top5及其对应案发次数;

格式:对于结果中的二维数据,要求使用 “制表符”作为声明文件分隔符。

复合排列:按照案发次数进行降序排列;

Select mode,count(*) tatal from final

Where class=’网络诈骗’

Group by mode

Order by total desc limit 5;

 

列出2019年破获案件总金额最高的接警员(按姓氏)top10及其破获总金额。

格式:对于结果中的二维数据,要求使用“制表符”作为声明文件分隔符。

复合排列:按照金额数目进行降序排列;

 

Row format delimited fields terminated by ‘\t’

Select police,sum(loss) total from final

Where state=’破案’ and year(break_time)=2019

Group by police

Order by total desc limit 10;

 

 

11.统计在A城案件中的各案件副类别发生频次。

格式:对于结果中的二维数据,要求使用“制表符”作为声明文件分隔符。

复合排列:按照类别频次进行降序排列;

Select class,count(*) total from final

Where take_place like ’A城%’

Group by class

Order by total desc;

 

12.在损失金额超过18万的案件中,找出受理案件最多的派出所top5及其对应

受理案件数目。

格式:对于结果中的二维数据,要求使用 “制表符”作为声明文件分隔符。

复合排列:先按照受理案件数目进行降序排列,再按照派出所名称升序排列;

 

Select accept_unit,count(*) cases from final

Where loss > 180000

Group by accept_unit

Order by cases desc,accept_unit limit 5;

 

 

13.统计出C城深夜时段发生案件总次数

 

Select count(*) from final

Where hour(take_time) in (0,1,2,3,4,5,6,7) and place=’C城’;

 

14.统计出2016年下半年破案数top10的警察及其破案数

 

Select police,count(*) total from final

Where state=’破案’ and year(break_time)=2016 and month(break_time) in (6,7,8,9,10,11,12)

Group by police

Order by total desc limit 10;

 

15.统计近年来损失程度较大的犯罪案件,显示年份和数量和经济损失,按年份排序

 

Select year(take_time) as a,count(*),sum(loss) from final

Where degree=’较大’

Group by a

Order by a;

 

16.统计所有诈骗手法,次数及金额

 

Select mode,count(*),sum(loss)

from final

Group by mode;

 

T17.统计2018年所有城市的破案率,保留两位小数 【17,18,19,20有更好的方法(但是我不会。。。)】

 

①Select count(*) from final where year(take_time)=2018 and year(break_time)=2018;

②Select count(*) from final

where year(take_time)=2018 and year(break_time)=2018 and state=’破案’;

③select round(①/②,2);

 

 

18.2019年发生最多的案件副类别的破案率?#网络诈骗 0.77

select class,count(*) as total from final where year(take_time)=2019 group by class order by total desc limit 1;    #网络诈骗 244

select count(*)/244 from final where year(take_time)=2019 and year(break_time)=2019 and class='网络诈骗';

 

 

19.比网络诈骗破案率高的案件副类别有什么? #电话诈骗,短信诈骗

select class,count(*) as total from final where state='破案' group by class order by total desc;

select class,count(*) as total from final group by class order by total desc;

select (1258/1688); #0.75

select (163/208); #0.78

select (83/104); #0.80

 

 

 

20.各民警在2017年8月的电信诈骗破案率?

select count(*) a from final where year(take_time)=2017 and month(take_time)=8 and year(break_time)=2017 and month(break_time)=8 and class=’电信诈骗’;

 

select police,count(*) b from final where year(take_time)=2017 and month(take_time)=8 and year(break_time)=2017 and month(break_time)=8 and police like '%警官' and class=’电信诈骗’ group by police order by b;

Select (a/b)

可视化

1.绘出所有案件的损失程度比例图;(饼图)

要求:1.图形类别为饼状图;

title=‘案件损失程度比例图’;

标签legend依次为:较大,巨大,特别巨大

series.name=‘损失程度’;

 

insert overwrite local directory '/root/test101/'

row format delimited fields terminated by '\t'

select count(*) from final;

select round(count(*)/2000,2)*100 from final where degree='较大';#70

select round(count(*)/2000,2)*100 from final where degree='巨大';#22

select round(count(*)/2000,2)*100 from final where degree='特别巨大';#8

 

option = {

    title: {

        text: '案件损失程度比例图',

        x: 'center' //位置居中

    },

    tooltip: {

        trigger: 'item', //触发键,把鼠标放在图形上面触发

formatter:"{a}<br/>{b}:{c}({d}%)"

    },

    legend: {

        orient: 'vertical', //垂直放图例

        x: 'left', //图例放在左边

data:['较大','巨大','特别巨大']

    },

    series: [

        {

            name: '损失程度',

            type: 'pie',

selectdMode:'single',

            data: [

                {value: 70, name: '较大'},

                {value: 22, name: '巨大'},

                {value: 8, name: '特别巨大'}

            ],

        }

    ]}

 

 

2.绘出所有案件的作案手法比例图;(双饼图)

要求:

title=’ 案件副类别及作案手法比例图’;

legend标签依次为'冒充熟人','网络贷款','购物钓鱼网站

','网络购物','银行卡案件','虚拟物品','涉案转账','网络预

测彩票';

series.name=‘作案手法’;

图形类型必须为饼状图;

 

insert overwrite local directory '/root/test102/'

row format delimited fields terminated by '\t'

select mode,count(*) from final

group by mode;

 

option = {

    title: {

        text: '某站点用户访问来源',

        x: 'center' //位置居中

    },

    tooltip: {

        trigger: 'item', //触发键,把鼠标放在图形上面触发

formatter:"{a}<br/>{b}:{c}({d}%)"

    },

    legend: {

        orient: 'vertical', //垂直放图例

        x: 'left', //图例放在左边

data:['1','2']

    },

    series: [

        {

            name: '名称',

            type: 'pie',

selectdMode:'single',

            radius: '30%',

            data: [

                {value: 1048, name: '搜索引擎'},

                {value: 735, name: '直接访问'},

                {value: 580, name: '邮件营销'},

                {value: 484, name: '联盟广告'},

                {value: 300, name: '视频广告'}

            ],

        }

{

            name:'地区',

            type:'pie',

            radius:['40%','55%'],

            data:[

              {value: 1048, name: '搜索引擎'},

                {value: 735, name: '直接访问'},

                {value: 580, name: '邮件营销'},

                {value: 484, name: '联盟广告'},

                {value: 300, name: '视频广告'}

                ]

        }

    ]

};

 

 

 

3.绘出2015-2019年每年的案件副类别发生频次随年份的变化图;(折线图)

要求:1.图形类别为折线图;

title=‘2015年至2019年各案件副类别频次图’;

标签依次为电话诈骗,短信诈骗,网络诈骗;

横坐标时间顺序依次为2015年,2016年,2017年,2018年,2019年;

 

select class,count(*) from final where year(take_time)='2015'

group by class ;

电话诈骗 58

短信诈骗 29

网络诈骗 368

select class,count(*) from final where year(take_time)='2016'

group by class ;

电话诈骗 53

短信诈骗 23

网络诈骗 328

select class,count(*) from final where year(take_time)='2017'

group by class ;

电话诈骗 39

短信诈骗 19

网络诈骗 367

select class,count(*) from final where year(take_time)='2018'

group by class ;

电话诈骗 30

短信诈骗 24

网络诈骗 381

select class,count(*) from final where year(take_time)='2019'

group by class ;

电话诈骗 28

短信诈骗 9

网络诈骗 244

option = {

        title : {

            text: '2015年至2019年各案件副类别频次图',

x:'center'

             },

        legend: {

            data:['电话诈骗','网络诈骗','短信诈骗'], //标签

             x:'right',

             orient: 'vertical', //垂直放图例

        },

        xAxis : [

            {

                type : 'category', //类目

                data : ['2015年','2016年','2017年','2018年','2019年']

            }

        ],

        yAxis : [

            {

                type : 'value', //数值轴

             }

        ],

        series : [

           {

                name:'电话诈骗',

                type:'line', //图形类型—折线图

                data:[58,53,38,30,28]

            },

         {

                name:'短信诈骗',

                type:'line', //图形类型—折线图

                data:[29,23,19,24,9]

            },

        {

                name:'网络诈骗',

                type:'line', //图形类型—折线图

                data:[368,328,367,381,244]

            },

        ]

    };

 

 

4.2016年与2018年各案件副类别破案率对比图(柱状图)

要求:1.图形类别为柱状图;

title=‘2016年与2018年各案件副类别破案率对比图’;

标签依次为'2016年','2018年';

y轴为类目轴,其坐标时间顺序依次为电话诈骗破案率,短信诈骗破案率,网络诈骗破案率;

round精度为小数点后2位;

 


hive的评论 (共 条)

分享到微博请遵守国家法律