AlexiaChen/AlexiaChen.github.io

gpload报gpload needs pyyaml的错

Closed this issue · 0 comments

为了使用gpload同步数据到GreenPlum数据库,但是运行以下命令报错了:

gpload -f <filename.yaml>

因为gpload使用了Python的pyyaml库来解析同步数据的yaml配置文件,但是它找不到这个库,猜测是某个环境变量设置得不对。

后来还运行greenplum client自带的环境变量配置的bash脚本,/usr/local/greenplum-db-clients-6.12.1/greenplum_clients_path.sh, 但是还是不行。

最后排查了一下,这类问题一般要看PYTHONHOME和PYTHONPATH这两个环境变量的设置有没有问题:PYTHONPATH里都是greenplum客户端设置的python lib路径,没有什么异常;PYTHONHOME却是/usr/bin这样比较奇怪的目录,于是就把它unset了,然后再调用gpload就成功了。

附上自动同步OSS域数据到GP的自动化bash脚本:

#!/usr/bin/env bash

# sshpass -p "user_password" sftp user@sftp.server.com:remote_dir/*.csv local_dir/

# if on production env, please change username to 'gpadmin'
username="gpadmin"
# if you do wanna save files in test dir, please change test to ''
test="_test"

history_list=("histroy_0")
history_list_len=1000

# Appends $1 to the global history_list.
# When the list already holds history_list_len entries, the oldest entry is
# popped first (history_pop_front stores it in first_element) and the file it
# names is removed.
history_push_back() {
    local incoming=$1

    if [[ ${#history_list[@]} == "${history_list_len}" ]]; then
        history_pop_front
        echo "deleting poped front txt file: ${first_element}"
        rm -f "${first_element}"
    fi

    history_list+=("${incoming}")
}

# Removes the oldest entry from the global history_list and leaves it in the
# global first_element for the caller to inspect.
history_pop_front() {
    first_element="${history_list[0]}"
    # Keep everything from index 1 onwards.
    history_list=("${history_list[@]:1}")
}

# Reports whether $1 is present in the global history_list.
# Returns: 0 when an entry equals $1 exactly, 1 otherwise.
check_in_history() {
    local wanted=$1
    local seen

    for seen in "${history_list[@]}"; do
        [[ "${seen}" == "${wanted}" ]] && return 0
    done

    return 1
}

# Runs one synchronisation pass for a single cycle:
#   1. derives the date/hour parts from the cycle id,
#   2. downloads the SMF zip archives of that hour from the remote SFTP host,
#   3. unzips every archive not yet recorded in history_list, creates the
#      per-day table with psql and loads the text file with gpload,
#   4. removes the local data directory that is three days older than the cycle.
# Globals:   username, test (read); history_list (through check_in_history and
#            history_push_back)
# Arguments: $1 - optional cycle id, YYYYmmddHHMMSS; defaults to the current time
sync_data() {
    local begin_date cycle_id year_id mon_id day_id hour_id min_id c_date clear_day
    local r_port r_host r_user r_pwd r_path_h r_qts r_qt r_path_qt lftp_status
    local gpload_host gpload_port gpload_user gpload_database create_table_sql
    local l_path l_clear_path l_unzipped_path l_gpload_path l_home
    local zip_file zipfilename filename txtfilename txtfilepath rendered_yaml

    echo "resyncing data"
    begin_date=$(date '+%Y%m%d%H%M%S')
    cycle_id=${begin_date:0:14}
    if [[ -n "${1}" ]]; then
      cycle_id=${1}
    fi

    #----------- time variables -----------------
    year_id=${cycle_id:0:4}
    mon_id=${cycle_id:0:6}
    day_id=${cycle_id:0:8}
    hour_id=${cycle_id:8:2}
    min_id=${cycle_id:10:2}

    #etl_time="FROM_UNIXTIME(UNIX_TIMESTAMP(),'yyyyMMddHHmmss')"
    c_date=${year_id}-${mon_id:4:2}-${day_id:6:2}
    # clear_day stays empty when date cannot parse c_date (e.g. a malformed $1);
    # the cleanup step at the end of this function checks for that.
    clear_day=$(date -d "$c_date -3 day " +%Y%m%d)

    # The remote hour directory is built from the hour without a leading zero.
    # String matching avoids the "integer expression expected" error the
    # numeric test raised when hour_id was empty.
    if [[ "${hour_id}" == 0? ]]; then
       hour_id=${hour_id:1:1}
    fi


    echo "cycle_id: ${cycle_id}"
    echo "year id: ${year_id}"
    echo "mon_id: ${mon_id}"
    echo "day_id: ${day_id}"
    echo "hour_id: ${hour_id}"
    echo "min_id: ${min_id}"

    # remote info
    # NOTE(review): the bracketed values below are placeholders; replace them
    # with real (quoted) values before running.
    r_port=22
    r_host=[remote host ip]
    r_user=[remote host username]
    r_pwd="[remote host user password]"
    r_path_h="/data01/yunnan/53/original/5g_smf/530000/${year_id}-${mon_id:4:2}-${day_id:6:2}/${hour_id}/"
    r_qts=('0' '1' '2' '3')

    gpload_host=[target greenplum host ip]
    gpload_port=[target greenplum host port]
    gpload_user=[target greenlum user]
    gpload_database=[target greenplum databasename]
    create_table_sql="CREATE TABLE IF NOT EXISTS <mysechema>.<table_name>_${day_id}(vnfid VARCHAR(100),sequence_no VARCHAR(100),starttime VARCHAR(100),endtime VARCHAR(100),operation_result VARCHAR(100),procedure_identification VARCHAR(100),protocol_cause VARCHAR(100),external_cause VARCHAR(100),fail_cause_type VARCHAR(100),fail_peernf_type VARCHAR(100),imsi VARCHAR(100),imei VARCHAR(100),msisdn VARCHAR(100),guami VARCHAR(100),pdu_session_id VARCHAR(100),qos_flow_id VARCHAR(100),linked_eps_bearer_id VARCHAR(100),eps_interworking_indication VARCHAR(100),dnn VARCHAR(100),snssai VARCHAR(100),ssc_mode VARCHAR(100),dnn_selection_mode VARCHAR(100),ladn_state VARCHAR(100),antype VARCHAR(100),rattype VARCHAR(100),tai VARCHAR(100),ncgi VARCHAR(100),gnodeb_id VARCHAR(100),ue_ipv4_address VARCHAR(100),ue_ipv6_address_central_psa VARCHAR(100),ue_ipv6_address_local_psa VARCHAR(100),local_access_type VARCHAR(100),central_psa_upf_node_id VARCHAR(100),local_psa_upf_node_id VARCHAR(100),an_ipv4_address_for_data VARCHAR(100),an_ipv6_address_for_data VARCHAR(100),charging_characteristics VARCHAR(100),charg_id VARCHAR(100),using_ul_ambr VARCHAR(100),using_dl_ambr VARCHAR(100),ul_apn_dnn_ambr_in_sub_data VARCHAR(100),dl_apn_dnn_ambr_in_sub_data VARCHAR(100),qi5_in_sub_data VARCHAR(100),arp_in_sub_data VARCHAR(100),using_5qi VARCHAR(100),using_arp VARCHAR(100),qo_sflow_qos_list VARCHAR(4000),p_provincecode VARCHAR(100),p_date VARCHAR(100),p_hour VARCHAR(100),p_quarter VARCHAR(100)) distributed BY (imsi)"

    # local info
    l_path="/home/${username}/gpfdist/data/smf_5g${test}/${day_id}/"
    l_clear_path="/home/${username}/gpfdist/data/smf_5g${test}/${clear_day}/"
    l_unzipped_path="/home/${username}/gpfdist/txt/"
    l_gpload_path="/home/${username}/gpload"
    l_home="/home/${username}"

    echo "r_path_h: ${r_path_h}"
    echo "l_path: ${l_path}"
    echo "l_clear_path: ${l_clear_path}"

    # mkdir -p is a no-op for directories that already exist.
    mkdir -p "${l_path}"
    mkdir -p "${l_unzipped_path}"

    # Each hour directory on the remote side has four sub-directories (0-3).
    for r_qt in "${r_qts[@]}"; do
        r_path_qt=${r_path_h}${r_qt}/
        echo "processing r_path_qt: ${r_path_qt}"
        lftp -u "${r_user},${r_pwd}" "sftp://${r_host}:${r_port}" <<_EOF_
            lcd ${l_path}
            cd ${r_path_qt}
            mget SMF_${day_id}*_log_node*.zip
_EOF_
        lftp_status=$?
        if [[ ${lftp_status} -eq 0 ]]; then
            echo "获取文件成功:${r_path_qt}"
        else
            echo "retMes <获取文件失败:${r_path_qt}> "
        fi
    done

    for zip_file in "${l_path}"*.zip; do
        # When nothing matches, the glob stays literal; -f filters that out.
        [[ -f "${zip_file}" ]] || continue

        zipfilename=$(basename "${zip_file}")
        # Strip only the trailing ".zip". The previous unanchored sed pattern
        # treated "." as a wildcard and removed the first match anywhere.
        filename=${zipfilename%.zip}
        txtfilename="${filename}.txt"
        txtfilepath="${l_unzipped_path}${txtfilename}"

        if check_in_history "${zip_file}"; then
            echo "skipped ${zip_file}"
            rm -f "${zip_file}"
            rm -f "${txtfilepath}"
            continue
        fi

        echo "unziping ${zip_file}"
        unzip -n "${zip_file}" -d "${l_unzipped_path}"

        # Without the gpload directory there is no control file to use; keep
        # the archive so that a later pass can retry it.
        if ! cd "${l_gpload_path}"; then
            echo "cannot cd to ${l_gpload_path}, skipping ${zip_file}" >&2
            continue
        fi

        # Render the bb.yaml template into a separate file instead of editing
        # it in place and reverting afterwards, so an interrupted run cannot
        # leave the template with substituted values.
        rendered_yaml="./bb.rendered.yaml"
        sed -e "s|txtfilepath|${txtfilepath}|g" -e "s|dayid|${day_id}|g" \
            ./bb.yaml > "${rendered_yaml}"
        echo "syncing txt file: ${txtfilepath}"
        # create table by date
        echo "runing before sql: ${create_table_sql}"
        psql -h "${gpload_host}" -p "${gpload_port}" -U "${gpload_user}" -d "${gpload_database}" -c "${create_table_sql}"
        if ! gpload -f "${rendered_yaml}"; then
            echo "gpload failed for ${txtfilepath}" >&2
        fi
        rm -f "${rendered_yaml}"

        cd "${l_home}"

        echo "sync finished: ${txtfilepath}"

        history_push_back "${zip_file}"
        rm -f "${zip_file}"
        rm -f "${txtfilepath}"
    done

    # With an empty clear_day, l_clear_path would be the parent data directory
    # itself, so only delete when the date was computed.
    if [[ -n "${clear_day}" ]]; then
        echo "deleting ${l_clear_path}"
        rm -rf -- "${l_clear_path}"
    else
        echo "skip cleanup: could not compute clear day from ${c_date}" >&2
    fi
}

# Calls sync_data with $1 forever, pausing five seconds between passes.
# Arguments: $1 - optional cycle id forwarded unchanged to sync_data
run_sync() {
    local cycle_arg=$1

    while true; do
      sync_data "${cycle_arg}"
      echo "waiting resync..."
      sleep 5s
    done
}

# Script entry point: announce start-up, then loop forever. The first
# command-line argument (optional cycle id) is forwarded to run_sync.
printf '%s\n' "running sync"
run_sync "${1}"