CREATE TEMPORARY TABLE temp_user_behavior (
`user_id` string comment 'user ID',
`item_id` string comment 'item ID',
`category_id` string comment 'category ID',
`behavior_type` string comment 'behavior type among pv, buy, cart, fav',
`timestamp` int comment 'timestamp')
row format delimited
fields terminated by ','
lines terminated by '\n'
STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH '/root/Hive/UserBehavior.csv' OVERWRITE INTO TABLE temp_user_behavior ;
drop table if exists user_behavior1;
create table user_behavior1 (
`user_id` string comment 'user ID',
`item_id` string comment 'item ID',
`category_id` string comment 'category ID',
`timestamp` timestamp comment 'timestamp'
)
PARTITIONED BY (`date` date, `behavior_type` string comment 'behavior type among pv, buy, cart, fav')
row format delimited
fields terminated by ','
lines terminated by '\n'
STORED AS ORC
TBLPROPERTIES ("orc.compress"="SNAPPY");
INSERT OVERWRITE TABLE user_behavior1 PARTITION (`date`, behavior_type)
SELECT
user_id,
item_id,
category_id,
from_unixtime(`timestamp`) AS `timestamp`,
date(from_unixtime(`timestamp`)) AS `date`,
behavior_type
FROM
temp_user_behavior;
select count(*) from user_behavior1;
+------------+
| _c0 |
+------------+
| 100150807 |
+------------+
-- 查看时间-数据分布情况,是否有异常值
select `date`, COUNT(*) from user_behavior1 group by `date` order by `date`;
-- 删除不在2017-11-25 到 2017-12-03日期的数据
alter table user_behavior1
drop IF EXISTS partition (`date`<'2017-11-25'), partition (`date`>'2017-12-03');
-- 再次查看时间是否有异常值
select `date`, COUNT(*) from user_behavior1 group by `date` order by `date`;
+-------------+-----------+
| date | _c1 |
+-------------+-----------+
| 2017-11-25 | 10511605 |
| 2017-11-26 | 10571046 |
| 2017-11-27 | 10013457 |
| 2017-11-28 | 9884189 |
| 2017-11-29 | 10319066 |
| 2017-11-30 | 10541698 |
| 2017-12-01 | 11171515 |
| 2017-12-02 | 13940949 |
| 2017-12-03 | 11961008 |
+-------------+-----------+
--查看 behavior_type 是否有异常值
select behavior_type, COUNT(*) from user_behavior1 group by behavior_type;
+----------------+-----------+
| behavior_type | _c1 |
+----------------+-----------+
| cart | 5466118 |
| pv | 88596903 |
| buy | 1998976 |
| fav | 2852536 |
+----------------+-----------+
-- 去掉完全重复的数据
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE user_behavior1
PARTITION (`date`, `behavior_type`)
SELECT DISTINCT `user_id`, `item_id`, `category_id`, `timestamp`, `date`, `behavior_type`
FROM user_behavior1
DISTRIBUTE BY `date`, `behavior_type`;
-- 查看目前多少条
select count(*) from user_behavior1;
+-----------+
| _c0 |
+-----------+
| 98914484 |
+-----------+