Doris 实时表使用Hive 数据初始化

Doris 实时表使用 Hive 数据进行数据初始化操作流程。


1. Doris table 建表语句确认

1
show create table dwd_homedo_real.dwd_sub_account_info;

2. Doris 临时表创建

因为Hive 表字段全部为小写,为了避免字段大小写不一致,所以需要创建doris全字段小写表

将步骤1查询到的交表语句字段修改为小写,并建表。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
CREATE TABLE `dwd_sub_account_info_test` (
`account_id` bigint(20) NULL COMMENT "",
`account_name` varchar(1000) NULL COMMENT "",
`audit_company_id` bigint(20) NULL COMMENT "",
`audit_company_name` varchar(1000) NULL COMMENT "",
`org_code` varchar(1000) NULL COMMENT "",
`org_name` varchar(1000) NULL COMMENT "",
`is_credit` varchar(1000) NULL COMMENT "",
`credit_amount` varchar(1000) NULL COMMENT "",
`amount_cycle_bymonth` varchar(1000) NULL COMMENT "",
`amount_cycle_byday` varchar(1000) NULL COMMENT "",
`amount_cycle_type` varchar(1000) NULL COMMENT "",
`is_used_amount_cycle` varchar(1000) NULL COMMENT "",
`is_used_credit_amount` varchar(1000) NULL COMMENT "",
`integral_balance` varchar(1000) NULL COMMENT "",
`change_recommended_time` varchar(1000) NULL COMMENT "",
`recommended_id` varchar(1000) NULL COMMENT "",
`recommended_name` varchar(1000) NULL COMMENT "",
`recommended_source` varchar(1000) NULL COMMENT "",
`channel` varchar(1000) NULL COMMENT "",
`flag_id` varchar(1000) NULL COMMENT "",
`flag_name` varchar(1000) NULL COMMENT "",
`parent_account_id` varchar(1000) NULL COMMENT "",
`follower_account_id` varchar(1000) NULL COMMENT "",
`ketuo_xiaoneng_id` varchar(1000) NULL COMMENT "",
`aladdin_id` varchar(1000) NULL COMMENT "",
`ctype` varchar(1000) NULL COMMENT "",
`ctype_name` varchar(1000) NULL COMMENT "",
`is_inner` varchar(1000) NULL COMMENT "",
`is_handler` varchar(1000) NULL COMMENT "",
`is_effect` varchar(1000) NULL COMMENT "",
`is_channel` varchar(1000) NULL COMMENT "",
`is_jicai` varchar(1000) NULL COMMENT "",
`jicai_audit_time` varchar(1000) NULL COMMENT "",
`jicai_overdue_time` varchar(1000) NULL COMMENT "",
`registered_time` varchar(1000) NULL COMMENT "",
`audit_time` varchar(1000) NULL COMMENT "",
`audit_status` int(11) NULL COMMENT "",
`audit_status_name` varchar(1000) NULL COMMENT "",
`account_status` varchar(1000) NULL COMMENT "",
`account_status_name` varchar(1000) NULL COMMENT "",
`is_vip` bigint(20) NULL COMMENT "",
`vip_level` varchar(1000) NULL COMMENT ""
) ENGINE=OLAP
UNIQUE KEY(`account_id`)
COMMENT "OLAP"
DISTRIBUTED BY HASH(`account_id`) BUCKETS 3
PROPERTIES (
"replication_num" = "3",
"in_memory" = "false",
"storage_format" = "V2"
);

3. Hive 临时表创建

字段顺序和 Doirs table一样,hive 不区分大小写,默认小写

Hive 临时表创建的目的有两个:

  1. 为了统一Hive 和 Doris的字段顺序和字段数量
  2. 将Hive 的存储类型统一为 parquet
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
CREATE TABLE dwd_homedo.dwd_sub_account_info_test (
account_id string,
account_name string,
audit_company_id string,
audit_company_name string,
org_code string,
org_name string,
is_credit string,
credit_amount string,
amount_cycle_bymonth string,
amount_cycle_byday string,
amount_cycle_type string,
is_used_amount_cycle string,
is_used_credit_amount string,
integral_balance string,
change_recommended_time string,
recommended_id string,
recommended_name string,
recommended_source string,
channel string,
flag_id string,
flag_name string,
parent_account_id string,
follower_account_id string,
ketuo_xiaoneng_id string,
aladdin_id string,
ctype string,
ctype_name string,
is_inner string,
is_handler string,
is_effect string,
is_channel string,
is_jicai string,
jicai_audit_time string,
jicai_overdue_time string,
registered_time string,
audit_time string,
audit_status string,
audit_status_name string,
account_status string,
account_status_name string,
is_VIP string,
VIP_level string
) stored as parquet
location '/opt/user/HomedoDB/warehouse/dwd_homedo/dwd_sub_account_info_test';

4. Hive 临时表数据导入

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
insert overwrite table dwd_homedo.dwd_sub_account_info_test
select
account_id,
account_name,
audit_company_id,
audit_company_name,
org_code,
org_name,
is_credit,
credit_amount,
amount_cycle_bymonth,
amount_cycle_byday,
amount_cycle_type,
is_used_amount_cycle,
is_used_credit_amount,
integral_balance,
change_recommended_time,
recommended_id,
recommended_name,
recommended_source,
channel,
flag_id,
flag_name,
parent_account_id,
follower_account_id,
ketuo_xiaoneng_id,
aladdin_id,
ctype_id,
ctype_name,
is_inner,
is_handler,
is_effect,
is_channel,
is_jicai,
jicai_audit_time,
jicai_overdue_time,
registered_time,
audit_time,
audit_status,
audit_status_name,
account_status,
account_status_name,
is_VIP,
VIP_level from dwd_homedo.dwd_sub_account_info;

5. Hive 数据导入 Doris 临时表

1
2
3
4
5
6
7
8
9
10
11
use dwd_homedo_real; -- Doris 库名

LOAD LABEL label_dwd_sub_account_info1(
DATA INFILE("hdfs://hmdservice/opt/user/HomedoDB/warehouse/dwd_homedo/dwd_sub_account_info_test/*") -- hive表路径
INTO TABLE `dwd_sub_account_info_test` -- Doris 表
FORMAT AS "parquet"
)
WITH BROKER broker_name
PROPERTIES(
"timeout"="3600",
"max_filter_ratio"="0.1" );

6. Doris 数据导入运行结果查看

1
SHOW LOAD WHERE LABEL = 'label_dwd_sub_account_info1';

7. Doris 实时表数据导入

1
2
3
4
5
6
7
8
9
10
11
12
13
14
INSERT INTO dwd_homedo_real.dwd_homedo_promotion_flag (
promotionId,
tagGroupId,
tagId,
tagValue,
mark
)
SELECT
promotionid,
taggroupid,
tagid,
tagvalue,
mark
FROM dwd_homedo_real.dwd_homedo_promotion_flag_test;

8. 数据量确认

1
2
3
4
5
6
7
select SUM(t.a)
from
(select COUNT(DISTINCT promotionId) a from dwd_homedo_real.dwd_homedo_promotion_flag group by promotionId) t;

select SUM(t.a)
from
(select COUNT(DISTINCT promotionid) a from dwd_homedo_real.dwd_homedo_promotion_flag_test group by promotionid) t;

9. 临时表删除

1
2
3
4
5
-- Hive 临时表删除
drop table dwd_homedo.dwd_sub_account_info_test;

-- Doris 临时表删除
drop table dwd_homedo_real.dwd_homedo_promotion_flag_test;