Doris 使用 sqoop 将 Doris 数据导入到 HDFS

使用 sqoop 将 Doris 数据导入到 HDFS,

HDFS 数据更新到 Hive 表分区。


Doris -> HDFS 临时路径

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/opt/cloudera/parcels/CDH/bin/sqoop-import \
--connect "jdbc:mysql://10.0.15.131:9030/dwd_homedo_ubt" \
--username root \
--password "root" \
--target-dir /opt/user/HomedoDB/warehouse/ods_cdp/mid_event_vd/d=20210729 \
--query "select distinct_id, xwho, xwhen, xwhat, os, utm_campaign_id, utm_campaign, utm_medium, dict_utm_medium, utm_source, dict_utm_source,
utm_content, utm_term, channel, referrer, referrer_domain, traffic_source_type, dict_traffic_source_type, search_engine, dict_search_engine,
search_keyword, social_share_from, scene, dict_scene, scene_type, dict_scene_type, platform, dict_platform, session_id, url, social_media,
dict_social_media, url_domain, track_xwhen, fingerprint, campaign_shortlink, offset, xwhat_id, need_update, network, os_version, is_first_day,
is_from_background, model, screen_width, is_first_time, brand, ip, screen_height, language, app_version, lib, is_time_calibrated, is_login,
lib_version, platform_extra, time_zone, manufacturer, country, province, city, title, duration, element_type, element_path, start_source,
element_content, element_id, carrier_name, element_position, parent_url, web_crawler, user_agent, browser_version, device_type,
dict_device_type, browser, startup_time, original_id, url_path, element_class_name, element_target_url, btn_name, type, content_type,
nav_name, \`desc\`, is_success, method, user_id, source_position_name, source_module, rank, name, module_name, link_page_url, ad_name,
ad_id, location, element_name, third_category, news_id, news_name, event_name, second_category, first_category, price, item_id, item_name,
item_count, keyword_type, min_buy_unit, review_num, list_price, event_id, reason, brand_id, shop_name, shop_id, brand_name, video_name,
video_id, is_filter, item_code, item_specific_module, shop_num, recommend_item_num, page_num, filter_detail, filter_type, crash_data,
string_price, second_category_id, stock_status_pro, first_category_id, stock_status, shelf_status, item_u8_code, string_price_pro,
third_category_id, min_buy_num, string_stp_price2, string_stp_price1, string_stp_price3, string_stp_num3, string_stp_num2, string_stp_num1,
string_stp_price4, string_stp_num4, string_stp_price5, string_stp_num5, pic_num, integral_use_amount, shipping_cost, orgcode, credit_period_type,
is_wait_deliver, insert_time, actual_pay_amount, coupon_freight_use_amount, is_negotiation, package_use_amount, discount_use_amount,
channel_type, order_id, category_code, payment_time, is_status, payment_method, per_pay_amount, is_first_order, status, smb_pay_time, order_amount,
confirm_time, action, viewport_position, viewport_width, viewport_height, event_duration, deal_amount, live_name, live_id, debug, news_channel,
news_subject, radio_num, plan_name, plan_id, click_content_rank, activity_tag, string_list_price, is_delete, device_id, oaid, imei, android_id,
element_clickable, click_x, click_y, page_height, element_y, element_x, page_width, push_title, activityid, activityname, activitytype, virtualurl,
prop_0, prop_1, prop_2, prop_3, prop_4, dict_platform_extra, page_name, sms_channel, message_id, ea_cohort_code, ad_space_type, ad_space_name,
grade1, ad_space_id, dese, item_type, package_num, package_id, pagestaytime, is_ad_sku, sku_rank, package_name, url_domain2, search_scene,
share_source_id, static_type
from dwd_homedo_ubt.dwd_log_event
where date_d = '2021-07-29'
and \$CONDITIONS" \
--delete-target-dir \
--fields-terminated-by '\001' \
--hive-delims-replacement '' \
--null-string '\\N' \
--null-non-string '\\N' \
--m 1

Hive 临时表 -> Hive 正式表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
hive -e "set hive.exec.dynamic.partition.mode=nonstrict;
set hive.execution.engine=mr;
msck repair table ods_cdp.mid_event_vd;
insert overwrite table ods_cdp.event_vd
partition(d)
select distinct_id, xwho, xwhen, xwhat, os, utm_campaign_id, utm_campaign, utm_medium, dict_utm_medium, utm_source, dict_utm_source,
utm_content, utm_term, channel, referrer, referrer_domain, traffic_source_type, dict_traffic_source_type, search_engine, dict_search_engine,
search_keyword, social_share_from, scene, dict_scene, scene_type, dict_scene_type, platform, dict_platform, session_id, url, social_media,
dict_social_media, url_domain, track_xwhen, fingerprint, campaign_shortlink, offset, xwhat_id, need_update, network, os_version, is_first_day,
is_from_background, model, screen_width, is_first_time, brand, ip, screen_height, language, app_version, lib, is_time_calibrated, is_login,
lib_version, platform_extra, time_zone, manufacturer, country, province, city, title, duration, element_type, element_path, start_source,
element_content, element_id, carrier_name, element_position, parent_url, web_crawler, user_agent, browser_version, device_type,
dict_device_type, browser, startup_time, original_id, url_path, element_class_name, element_target_url, btn_name, type, content_type,
nav_name, desc, is_success, method, user_id, source_position_name, source_module, rank, name, module_name, link_page_url, ad_name,
ad_id, location, element_name, third_category, news_id, news_name, event_name, second_category, first_category, price, item_id, item_name,
item_count, keyword_type, min_buy_unit, review_num, list_price, event_id, reason, brand_id, shop_name, shop_id, brand_name, video_name,
video_id, is_filter, item_code, item_specific_module, shop_num, recommend_item_num, page_num, filter_detail, filter_type, crash_data,
string_price, second_category_id, stock_status_pro, first_category_id, stock_status, shelf_status, item_u8_code, string_price_pro,
third_category_id, min_buy_num, string_stp_price2, string_stp_price1, string_stp_price3, string_stp_num3, string_stp_num2, string_stp_num1,
string_stp_price4, string_stp_num4, string_stp_price5, string_stp_num5, pic_num, integral_use_amount, shipping_cost, orgcode, credit_period_type,
is_wait_deliver, insert_time, actual_pay_amount, coupon_freight_use_amount, is_negotiation, package_use_amount, discount_use_amount,
channel_type, order_id, category_code, payment_time, is_status, payment_method, per_pay_amount, is_first_order, status, smb_pay_time, order_amount,
confirm_time, action, viewport_position, viewport_width, viewport_height, event_duration, deal_amount, live_name, live_id, debug, news_channel,
news_subject, radio_num, plan_name, plan_id, click_content_rank, activity_tag, string_list_price, is_delete, device_id, oaid, imei, android_id,
element_clickable, click_x, click_y, page_height, element_y, element_x, page_width, push_title, activityid, activityname, activitytype, virtualurl,
prop_0, prop_1, prop_2, prop_3, prop_4, dict_platform_extra, page_name, sms_channel, message_id, ea_cohort_code, ad_space_type, ad_space_name,
grade1, ad_space_id, dese, item_type, package_num, package_id, pagestaytime, is_ad_sku, sku_rank, package_name, url_domain2, search_scene,
share_source_id, static_type, d
from ods_cdp.mid_event_vd
where d = '20210729';"

删除临时表分区数据

1
hive -e "alter table ods_cdp.mid_event_vd drop partition (d = '20210729');"

自动化调度脚本

azkaban调度任务Zip内容

azkaban.project

1
azkaban-flow-version: 2.0

sync_ubt.flow

1
2
3
4
5
nodes:
- name: sqoop_event_vd
type: command
config:
command: sh /opt/sync/sync_script/ods_cdp/sqoop_event_vd.sh

sqoop_event_vd.sh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/bin/bash

deleteDate=`date -d "1 day ago" +%Y%m%d`

for ((i=1; i<=11; i++))

do

dorisDate=`date -d "$i day ago" +%Y-%m-%d`

hiveDate=`date -d "$i day ago" +%Y%m%d`

/opt/cloudera/parcels/CDH/bin/sqoop-import \
--connect "jdbc:mysql://10.0.15.131:9030/dwd_homedo_ubt" \
--username root \
--password "root" \
--target-dir /opt/user/HomedoDB/warehouse/ods_cdp/mid_event_vd/d=$hiveDate \
--query "select distinct_id, xwho, xwhen, xwhat, os, utm_campaign_id, utm_campaign, utm_medium, dict_utm_medium, utm_source, dict_utm_source,
utm_content, utm_term, channel, referrer, referrer_domain, traffic_source_type, dict_traffic_source_type, search_engine, dict_search_engine,
search_keyword, social_share_from, scene, dict_scene, scene_type, dict_scene_type, platform, dict_platform, session_id, url, social_media,
dict_social_media, url_domain, track_xwhen, fingerprint, campaign_shortlink, offset, xwhat_id, need_update, network, os_version, is_first_day,
is_from_background, model, screen_width, is_first_time, brand, ip, screen_height, language, app_version, lib, is_time_calibrated, is_login,
lib_version, platform_extra, time_zone, manufacturer, country, province, city, title, duration, element_type, element_path, start_source,
element_content, element_id, carrier_name, element_position, parent_url, web_crawler, user_agent, browser_version, device_type,
dict_device_type, browser, startup_time, original_id, url_path, element_class_name, element_target_url, btn_name, type, content_type,
nav_name, \`desc\`, is_success, method, user_id, source_position_name, source_module, rank, name, module_name, link_page_url, ad_name,
ad_id, location, element_name, third_category, news_id, news_name, event_name, second_category, first_category, price, item_id, item_name,
item_count, keyword_type, min_buy_unit, review_num, list_price, event_id, reason, brand_id, shop_name, shop_id, brand_name, video_name,
video_id, is_filter, item_code, item_specific_module, shop_num, recommend_item_num, page_num, filter_detail, filter_type, crash_data,
string_price, second_category_id, stock_status_pro, first_category_id, stock_status, shelf_status, item_u8_code, string_price_pro,
third_category_id, min_buy_num, string_stp_price2, string_stp_price1, string_stp_price3, string_stp_num3, string_stp_num2, string_stp_num1,
string_stp_price4, string_stp_num4, string_stp_price5, string_stp_num5, pic_num, integral_use_amount, shipping_cost, orgcode, credit_period_type,
is_wait_deliver, insert_time, actual_pay_amount, coupon_freight_use_amount, is_negotiation, package_use_amount, discount_use_amount,
channel_type, order_id, category_code, payment_time, is_status, payment_method, per_pay_amount, is_first_order, status, smb_pay_time, order_amount,
confirm_time, action, viewport_position, viewport_width, viewport_height, event_duration, deal_amount, live_name, live_id, debug, news_channel,
news_subject, radio_num, plan_name, plan_id, click_content_rank, activity_tag, string_list_price, is_delete, device_id, oaid, imei, android_id,
element_clickable, click_x, click_y, page_height, element_y, element_x, page_width, push_title, activityid, activityname, activitytype, virtualurl,
prop_0, prop_1, prop_2, prop_3, prop_4, dict_platform_extra, page_name, sms_channel, message_id, ea_cohort_code, ad_space_type, ad_space_name,
grade1, ad_space_id, dese, item_type, package_num, package_id, pagestaytime, is_ad_sku, sku_rank, package_name, url_domain2, search_scene,
share_source_id, static_type
from dwd_homedo_ubt.dwd_log_event
where date_d = '$dorisDate'
and \$CONDITIONS" \
--delete-target-dir \
--fields-terminated-by '\001' \
--hive-delims-replacement '' \
--null-string '\\N' \
--null-non-string '\\N' \
--m 1

hive -e "set hive.exec.dynamic.partition.mode=nonstrict;
set hive.execution.engine=mr;
msck repair table ods_cdp.mid_event_vd;
insert overwrite table ods_cdp.event_vd
partition(d)
select distinct_id, xwho, xwhen, xwhat, os, utm_campaign_id, utm_campaign, utm_medium, dict_utm_medium, utm_source, dict_utm_source,
utm_content, utm_term, channel, referrer, referrer_domain, traffic_source_type, dict_traffic_source_type, search_engine, dict_search_engine,
search_keyword, social_share_from, scene, dict_scene, scene_type, dict_scene_type, platform, dict_platform, session_id, url, social_media,
dict_social_media, url_domain, track_xwhen, fingerprint, campaign_shortlink, offset, xwhat_id, need_update, network, os_version, is_first_day,
is_from_background, model, screen_width, is_first_time, brand, ip, screen_height, language, app_version, lib, is_time_calibrated, is_login,
lib_version, platform_extra, time_zone, manufacturer, country, province, city, title, duration, element_type, element_path, start_source,
element_content, element_id, carrier_name, element_position, parent_url, web_crawler, user_agent, browser_version, device_type,
dict_device_type, browser, startup_time, original_id, url_path, element_class_name, element_target_url, btn_name, type, content_type,
nav_name, desc, is_success, method, user_id, source_position_name, source_module, rank, name, module_name, link_page_url, ad_name,
ad_id, location, element_name, third_category, news_id, news_name, event_name, second_category, first_category, price, item_id, item_name,
item_count, keyword_type, min_buy_unit, review_num, list_price, event_id, reason, brand_id, shop_name, shop_id, brand_name, video_name,
video_id, is_filter, item_code, item_specific_module, shop_num, recommend_item_num, page_num, filter_detail, filter_type, crash_data,
string_price, second_category_id, stock_status_pro, first_category_id, stock_status, shelf_status, item_u8_code, string_price_pro,
third_category_id, min_buy_num, string_stp_price2, string_stp_price1, string_stp_price3, string_stp_num3, string_stp_num2, string_stp_num1,
string_stp_price4, string_stp_num4, string_stp_price5, string_stp_num5, pic_num, integral_use_amount, shipping_cost, orgcode, credit_period_type,
is_wait_deliver, insert_time, actual_pay_amount, coupon_freight_use_amount, is_negotiation, package_use_amount, discount_use_amount,
channel_type, order_id, category_code, payment_time, is_status, payment_method, per_pay_amount, is_first_order, status, smb_pay_time, order_amount,
confirm_time, action, viewport_position, viewport_width, viewport_height, event_duration, deal_amount, live_name, live_id, debug, news_channel,
news_subject, radio_num, plan_name, plan_id, click_content_rank, activity_tag, string_list_price, is_delete, device_id, oaid, imei, android_id,
element_clickable, click_x, click_y, page_height, element_y, element_x, page_width, push_title, activityid, activityname, activitytype, virtualurl,
prop_0, prop_1, prop_2, prop_3, prop_4, dict_platform_extra, page_name, sms_channel, message_id, ea_cohort_code, ad_space_type, ad_space_name,
grade1, ad_space_id, dese, item_type, package_num, package_id, pagestaytime, is_ad_sku, sku_rank, package_name, url_domain2, search_scene,
share_source_id, static_type, d
from ods_cdp.mid_event_vd
where d = '$hiveDate';"

done

hive -e "alter table ods_cdp.mid_event_vd drop partition (d < '$deleteDate');"