有日志如下,请写出代码求得所有用户和活跃用户的总数及平均年龄。(活跃用户指连续两天都有访问记录的用户)
日期 用户 年龄
-02-11,test_1,23
-02-11,test_2,19
-02-11,test_3,39
-02-11,test_1,23
-02-11,test_3,39
-02-11,test_1,23
-02-12,test_2,19
-02-13,test_1,23
-02-15,test_2,19
-02-16,test_2,19
CREATE TABLE user_age (dt string,user_id string, age INT ) ROW format delimited FIELDS TERMINATED BY ',';
1)按照日期以及用户分组,按照日期排序并给出排名
selectdt,user_id,min(age)age,rank()over(partitionbyuser_idorderbydt)rkfromuser_agegroup bydt,user_id;t1
2)计算日期及排名的差值
selectuser_id,age,date_sub(dt,rk) flagfromt1;t2
3)过滤出差值大于等于2的,即为连续两天活跃的用户
selectuser_id,min(age) agefromt2group byuser_id,flaghavingcount(*)>=2;t3
4)对数据进行去重处理(一个用户可以在两个不同的时间点连续登录),例如:a用户在1月10号1月11号以及1月20号和1月21号4天登录。
selectuser_id,min(age) agefromt3group byuser_id;t4
5)计算活跃用户(两天连续有访问)的人数以及平均年龄
selectcount(*) ct,cast(sum(age)/count(*) as decimal(10,2))from t4;
6)对全量数据集进行按照用户去重
selectuser_id,min(age) agefromuser_agegroup byuser_id;t5
7)计算所有用户的数量以及平均年龄
selectcount(*) user_count,cast((sum(age)/count(*)) as decimal(10,1))fromt5;
8)将第5步以及第7步两个数据集进行unionall操作
select0 user_total_count,0 user_total_avg_age,count(*) twice_count,cast(sum(age)/count(*) as decimal(10,2)) twice_count_avg_agefrom(selectuser_id,min(age) agefrom(selectuser_id,min(age) agefrom(selectuser_id,age,date_sub(dt,rk) flagfrom(selectdt,user_id,min(age) age,rank() over(partition by user_id order by dt) rkfromuser_agegroup bydt,user_id)t1)t2group byuser_id,flaghavingcount(*)>=2)t3group byuser_id)t4union allselectcount(*) user_total_count,cast((sum(age)/count(*)) as decimal(10,1)),0 twice_count,0 twice_count_avg_agefrom(selectuser_id,min(age) agefromuser_agegroup byuser_id)t5;t6
9)计算最终结果
selectsum(user_total_count),sum(user_total_avg_age),sum(twice_count),sum(twice_count_avg_age)from(select0 user_total_count,0 user_total_avg_age,count(*) twice_count,cast(sum(age)/count(*) as decimal(10,2)) twice_count_avg_agefrom(selectuser_id,min(age) agefrom(selectuser_id,min(age) agefrom(selectuser_id,age,date_sub(dt,rk) flagfrom(selectdt,user_id,min(age) age,rank() over(partition by user_id order by dt) rkfromuser_agegroup bydt,user_id)t1)t2group byuser_id,flaghavingcount(*)>=2)t3group byuser_id)t4union allselectcount(*) user_total_count,cast((sum(age)/count(*)) as decimal(10,1)),0 twice_count,0 twice_count_avg_agefrom(selectuser_id,min(age) agefromuser_agegroup byuser_id)t5)t6;