gpload报gpload needs pyyaml的错
Closed this issue · 0 comments
AlexiaChen commented
为了使用gpload同步数据到GreenPlum数据库,但是运行以下命令报错了:
gpload -f <filename.yaml>
因为gpload使用了python相关的pyyaml的库来解析同步数据的yaml配置文件,但是它找不到了,猜测是哪里环境变量不对。
后来还运行greenplum client自带的环境变量配置的bash脚本,/usr/local/greenplum-db-clients-6.12.1/greenplum_clients_path.sh
, 但是还是不行。
最后找了下,一般看两个,PYTHONHOME和PYTHONPATH这两个环境变量设置有没有问题,PYTHONPATH看到的都是greenplum客户端设置的python lib路径,没有什么异常,PYTHONHOME却是/usr/bin这样的比较奇怪的目录,于是就把它unset了,然后调用gpload就成功了。
附上自动同步OSS域数据到GP的自动化bash脚本:
#!/usr/bin/env bash
# Automated sync of OSS-domain 5G SMF data into Greenplum via gpload.
# NOTE(review): gpload needs pyyaml; if PYTHONHOME points somewhere odd
# (e.g. /usr/bin), unset it before running — see the issue text above.
# sshpass -p "user_password" sftp user@sftp.server.com:remote_dir/*.csv local_dir/
# if on production env, please change username to 'gpadmin'
username="gpadmin"
# if you do wanna save files in test dir, please change test to ''
test="_test"
# Bounded FIFO of already-processed zip paths, seeded with a dummy entry
# (the "histroy_0" spelling is the original sentinel value; it never
# matches a real file path, so it is harmless).
history_list=("histroy_0")
# Maximum number of entries kept in history_list before the oldest is evicted.
history_list_len=1000
# Append $1 to the bounded global history_list.
# When the list is at (or, defensively, above) capacity, evict the oldest
# entry via history_pop_front — which exports it as $first_element — and
# delete the corresponding file from disk.
# Fixes vs original: numeric comparison uses -ge instead of string ==,
# "poped" typo in the log message corrected, rm guarded with --, and the
# append uses the += array idiom.
history_push_back() {
  if [ "${#history_list[@]}" -ge "${history_list_len}" ]; then
    history_pop_front
    echo "deleting popped front txt file: ${first_element}"
    rm -f -- "${first_element}"
  fi
  history_list+=("$1")
}
# Remove the first element of the global history_list, exporting the
# removed value through the global first_element for the caller to use.
history_pop_front() {
  first_element="${history_list[0]}"
  # Re-slice from index 1 onward; omitting the length takes the rest.
  history_list=("${history_list[@]:1}")
}
# Membership test: return 0 (success) when $1 is already recorded in the
# global history_list array, 1 (failure) otherwise.
check_in_history() {
  local entry
  for entry in "${history_list[@]}"; do
    if [[ "${entry}" == "$1" ]]; then
      return 0
    fi
  done
  return 1
}
# Perform one full sync pass:
#   1. derive time variables from the current time (or $1 if given),
#   2. pull zipped SMF log exports for each quarter-hour dir via lftp/sftp,
#   3. unzip each new archive, template bb.yaml, create the day table,
#      load the txt file into Greenplum with gpload, restore bb.yaml,
#   4. record processed zips in history_list and purge data > 3 days old.
# Arguments:
#   $1 - optional cycle id (yyyymmddHHMMSS); defaults to "now".
# NOTE(review): bracketed values like [remote host ip] and the
# $(unknown) command substitution are redacted placeholders that must be
# filled in before this script can actually run.
sync_data() {
echo "resyncing data"
# Timestamp of this pass; its first 14 chars form the default cycle id.
begin_date=$(date '+%Y%m%d%H%M%S')
cycle_id=${begin_date:0:14}
# An explicit cycle id passed as $1 overrides "now".
if [ -n "${1}" ];then
cycle_id=${1}
fi
#----------- time variables -----------------
year_id=${cycle_id:0:4}
mon_id=${cycle_id:0:6}
day_id=${cycle_id:0:8}
hour_id=${cycle_id:8:2}
min_id=${cycle_id:10:2}
#etl_time="FROM_UNIXTIME(UNIX_TIMESTAMP(),'yyyyMMddHHmmss')"
# YYYY-MM-DD form of the cycle date; local data from 3 days before it
# is deleted at the end of this pass (GNU date -d syntax).
c_date=${year_id}-${mon_id:4:2}-${day_id:6:2}
clear_day=$(date -d "$c_date -3 day " +%Y%m%d)
# Remote hour directories are unpadded, so strip a leading zero.
if [ ${hour_id:0:1} -eq "0" ]; then
hour_id=${hour_id:1:1}
fi
echo "cycle_id: ${cycle_id}"
echo "year id: ${year_id}"
echo "mon_id: ${mon_id}"
echo "day_id: ${day_id}"
echo "hour_id: ${hour_id}"
echo "min_id: ${min_id}"
# remote info (bracketed values are redacted placeholders)
r_port=22
r_host=[remote host ip]
r_user=[remote host username]
r_pwd="[remote host user password]"
r_path_h="/data01/yunnan/53/original/5g_smf/530000/${year_id}-${mon_id:4:2}-${day_id:6:2}/${hour_id}/"
# Quarter subdirectories under the hour directory on the remote host.
r_qts=('0' '1' '2' '3')
gpload_host=[target greenplum host ip]
gpload_port=[target greenplum host port]
gpload_user=[target greenlum user]
gpload_database=[target greenplum databasename]
# DDL executed before every gpload run; one table per day, distributed by imsi.
create_table_sql="CREATE TABLE IF NOT EXISTS <mysechema>.<table_name>_${day_id}(vnfid VARCHAR(100),sequence_no VARCHAR(100),starttime VARCHAR(100),endtime VARCHAR(100),operation_result VARCHAR(100),procedure_identification VARCHAR(100),protocol_cause VARCHAR(100),external_cause VARCHAR(100),fail_cause_type VARCHAR(100),fail_peernf_type VARCHAR(100),imsi VARCHAR(100),imei VARCHAR(100),msisdn VARCHAR(100),guami VARCHAR(100),pdu_session_id VARCHAR(100),qos_flow_id VARCHAR(100),linked_eps_bearer_id VARCHAR(100),eps_interworking_indication VARCHAR(100),dnn VARCHAR(100),snssai VARCHAR(100),ssc_mode VARCHAR(100),dnn_selection_mode VARCHAR(100),ladn_state VARCHAR(100),antype VARCHAR(100),rattype VARCHAR(100),tai VARCHAR(100),ncgi VARCHAR(100),gnodeb_id VARCHAR(100),ue_ipv4_address VARCHAR(100),ue_ipv6_address_central_psa VARCHAR(100),ue_ipv6_address_local_psa VARCHAR(100),local_access_type VARCHAR(100),central_psa_upf_node_id VARCHAR(100),local_psa_upf_node_id VARCHAR(100),an_ipv4_address_for_data VARCHAR(100),an_ipv6_address_for_data VARCHAR(100),charging_characteristics VARCHAR(100),charg_id VARCHAR(100),using_ul_ambr VARCHAR(100),using_dl_ambr VARCHAR(100),ul_apn_dnn_ambr_in_sub_data VARCHAR(100),dl_apn_dnn_ambr_in_sub_data VARCHAR(100),qi5_in_sub_data VARCHAR(100),arp_in_sub_data VARCHAR(100),using_5qi VARCHAR(100),using_arp VARCHAR(100),qo_sflow_qos_list VARCHAR(4000),p_provincecode VARCHAR(100),p_date VARCHAR(100),p_hour VARCHAR(100),p_quarter VARCHAR(100)) distributed BY (imsi)"
# local info
l_path="/home/${username}/gpfdist/data/smf_5g${test}/${day_id}/"
l_clear_path="/home/${username}/gpfdist/data/smf_5g${test}/${clear_day}/"
l_unzipped_path="/home/${username}/gpfdist/txt/"
l_gpload_path="/home/${username}/gpload"
l_home="/home/${username}"
echo "r_path_h: ${r_path_h}"
echo "l_path: ${l_path}"
echo "l_clear_path: ${l_clear_path}"
if [ ! -d "${l_path}" ]; then
mkdir -p "${l_path}"
fi
if [ ! -d "${l_unzipped_path}" ]; then
mkdir -p "${l_unzipped_path}"
fi
# Download this day's zip archives from each quarter directory over SFTP.
for r_qt in "${r_qts[@]}";
do
r_path_qt=${r_path_h}${r_qt}/
echo "processing r_path_qt: ${r_path_qt}"
lftp -u ${r_user},${r_pwd} sftp://${r_host}:${r_port} <<_EOF_
lcd ${l_path}
cd ${r_path_qt}
mget SMF_${day_id}*_log_node*.zip
_EOF_
# $? here is lftp's exit status; failure is only logged, not fatal.
if [ $? = 0 ];then
echo "获取文件成功:${r_path_qt}"
else
echo "retMes <获取文件失败:${r_path_qt}> "
fi
done
# Process every downloaded archive that has not been loaded before.
for zip_file in "${l_path}"*.zip;
do
if test -f "${zip_file}"
then
zipfilename=$(basename "${zip_file}")
# NOTE(review): the dot in the sed pattern is unescaped, so it matches
# any character before "zip"; harmless for these file names, but ${zipfilename%.zip}
# would be more precise — confirm before changing.
filename=$(echo "${zipfilename}" | sed s/.zip//)
# NOTE(review): $(unknown) is a redacted command substitution — the real
# script presumably derives the txt name from ${filename}.
txtfilename="$(unknown).txt"
txtfilepath="${l_unzipped_path}${txtfilename}"
# Skip (and clean up) archives that were already loaded in a prior pass.
check_in_history "${zip_file}"
if [ $? == 0 ]; then
echo "skipped ${zip_file}"
rm -f "${zip_file}"
rm -f "${txtfilepath}"
continue
fi
echo "unziping ${zip_file}"
unzip -n "${zip_file}" -d ${l_unzipped_path}
cd ${l_gpload_path}
# Template bb.yaml in place: substitute the literal placeholders
# "txtfilepath" and "dayid" with this file's values.
sed -i -e "s|txtfilepath|${txtfilepath}|g" ./bb.yaml
sed -i -e "s|dayid|${day_id}|g" ./bb.yaml
echo "syncing txt file: ${txtfilepath}"
# create table by date
echo "runing before sql: ${create_table_sql}"
psql -h ${gpload_host} -p ${gpload_port} -U ${gpload_user} -d ${gpload_database} -c "${create_table_sql}"
gpload -f ./bb.yaml
# Restore bb.yaml to its placeholder form for the next iteration.
# NOTE(review): this reverse substitution is fragile — it assumes the txt
# path shape and that ${day_id} appears nowhere else in bb.yaml; a
# pristine template copy would be safer.
sed -i -E -e "s/(\/.*\/)(.*\.txt)/txtfilepath/" ./bb.yaml
sed -i -e "s|${day_id}|dayid|g" ./bb.yaml
cd ${l_home}
echo "sync finished: ${txtfilepath}"
history_push_back "${zip_file}"
rm -f "${zip_file}"
rm -f "${txtfilepath}"
fi
done
# Purge the local data directory from 3 days ago.
echo "deleting ${l_clear_path}"
rm -rf "${l_clear_path}"
}
# Supervisor loop: run a full sync pass every 5 seconds, forever.
# Arguments:
#   $1 - optional cycle id forwarded to sync_data on every pass.
run_sync() {
  while true; do
    sync_data "${1}"
    echo "waiting resync..."
    sleep 5s
  done
}
echo "running sync"
# Entry point: never returns; optional $1 pins the cycle id for every pass.
run_sync "$1"