开启服务的方式连接hive:
bin/beeline!connect jdbc:hive2://localhost:10000
假如有一个web系统,每天生成以下日志文件:
-09-15号的数据:
192.168.33.6,hunter,-09-15 10:30:20,/a192.168.33.7,hunter,-09-15 10:30:26,/b192.168.33.6,jack,-09-15 10:30:27,/a192.168.33.8,tom,-09-15 10:30:28,/b192.168.33.9,rose,-09-15 10:30:30,/b192.168.33.10,julia,-09-15 10:30:40,/c
-09-16号的数据:
192.168.33.16,hunter,-09-16 10:30:20,/a192.168.33.18,jerry,-09-16 10:30:30,/b192.168.33.26,jack,-09-16 10:30:40,/a192.168.33.18,polo,-09-16 10:30:50,/b192.168.33.39,nissan,-09-16 10:30:53,/b192.168.33.39,nissan,-09-16 10:30:55,/a192.168.33.39,nissan,-09-16 10:30:58,/c192.168.33.20,ford,-09-16 10:30:54,/c
-09-17号的数据:
192.168.33.46,hunter,-09-17 10:30:21,/a192.168.43.18,jerry,-09-17 10:30:22,/b192.168.43.26,tom,-09-17 10:30:23,/a192.168.53.18,bmw,-09-17 10:30:24,/b192.168.63.39,benz,-09-17 10:30:25,/b192.168.33.25,haval,-09-17 10:30:30,/c192.168.33.10,julia,-09-17 10:30:40,/c
需求:
统计每天的活跃用户(日活)(需要用户的ip,用户的账号,用户访问时间最早的一条url和时间)
统计每天的新增用户(日新)
第一步:建立一个表,来存储每天新增的数据(分区表)
create table testdemo(ip string,uid string,access_time string ,url string)partitioned by (day string)row format delimitedfields terminated by ',';
放数据:
load data local inpath '/tmp/log.15' into table testdemo partition(day='-08-15');load data local inpath '/tmp/log.16' into table testdemo partition(day='-08-16');load data local inpath '/tmp/log.17' into table testdemo partition(day='-08-17');
第二步:建一个分区表,放日活跃用户 (用户去重):分隔符默认"/001"
create table t_user_active_day(ip string,uid string,access_time string ,url string)partitioned by (day string);
统计日活;
insert into table t_user_active_day partition(day='-08-15')select ip,uid,access_time,urlfrom(select ip,uid,access_time,url,row_number() over(partition by uid order by access_time) as rnfrom testdemowhere day='-08-15') tmpwhere rn=1;insert into table t_user_active_day partition(day='-08-16')select ip,uid,access_time,urlfrom(select ip,uid,access_time,url,row_number() over(partition by uid order by access_time) as rnfrom testdemowhere day='-08-16') tmpwhere rn=1;insert into table t_user_active_day partition(day='-08-17')select ip,uid,access_time,urlfrom(select ip,uid,access_time,url,row_number() over(partition by uid order by access_time) as rnfrom testdemowhere day='-08-17') tmpwhere rn=1;
第三步:为了统计日新,建表
建一个历史用户表:
create table t_user_history(uid string);
建一个新用户表:
create table t_user_new_day like t_user_active_day;
第四步:使用每天的的活跃用户与历史表进行关联,找到新增的用户,并放到新增用户表里;
insert into table t_user_new_day partition(day='-08-15')select ip,uid,access_time,urlfrom (select a.ip,a.uid,a.access_time,a.url,b.uid as b_uidfrom t_user_active_day a left join t_user_history b on a.uid=b.uidwhere a.day='-08-15') tmpwhere tmp.b_uid is null;
第五步:
将新增用户表里的数据,放到历史用户表里。
insert into table t_user_historyselect uid fromt_user_new_day where day='-08-15';
最后:重复第四步和第五步
insert into table t_user_new_day partition(day='-08-16')select ip,uid,access_time,urlfrom (select a.ip,a.uid,a.access_time,a.url,b.uid as b_uidfrom t_user_active_day a left join t_user_history b on a.uid=b.uidwhere a.day='-08-16') tmpwhere tmp.b_uid is null;insert into table t_user_historyselect uid fromt_user_new_day where day='-08-16';insert into table t_user_new_day partition(day='-08-17')select ip,uid,access_time,urlfrom (select a.ip,a.uid,a.access_time,a.url,b.uid as b_uidfrom t_user_active_day a left join t_user_history b on a.uid=b.uidwhere a.day='-08-17') tmpwhere tmp.b_uid is null;insert into table t_user_historyselect uid fromt_user_new_day where day='-08-17';
注意:重复操作应该放在脚本里定时每天执行,此处只是demo,仅供参考!!!