preheating the large image failed
Opened this issue · 4 comments
image name: uhub.service.ucloud.cn/openbayes_algopub/inference_llm:0.0.2
Preheating the large image failed. The job details are as follows:
{
"id": 2,
"created_at": "2024-01-03T08:39:12Z",
"updated_at": "2024-01-03T08:44:41Z",
"is_del": 0,
"task_id": "group_48ad15e5-5712-4e22-a8ef-ec4a2697e824",
"bio": "",
"type": "preheat",
"state": "FAILURE",
"args": {
"filter": "Expires&Signature",
"headers": null,
"password": "Signcl2013&&",
"platform": "",
"tag": "",
"type": "image",
"url": "https://uhub.service.ucloud.cn/v2/openbayesruntimes/pytorch/manifests/1.8.2-py38-cu111.87",
"username": "shanchuan@openbayes.com"
},
"result": {
"CreatedAt": "2024-01-03T08:39:12.23073062Z",
"GroupUUID": "group_48ad15e5-5712-4e22-a8ef-ec4a2697e824",
"JobStates": [
{
"CreatedAt": "2024-01-03T08:39:12.223751052Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_5cd1395a-b9d1-4498-9040-2ec07f0f7f3d"
},
{
"CreatedAt": "2024-01-03T08:39:12.224361629Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_58bc8c0d-39b2-476c-9d04-fad017b72580"
},
{
"CreatedAt": "2024-01-03T08:39:12.224919897Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_fe297a79-25fe-4a3a-aba8-5b0922228a9f"
},
{
"CreatedAt": "2024-01-03T08:39:12.225476452Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_2f36ff98-fac6-4830-8725-2497ef5deb74"
},
{
"CreatedAt": "2024-01-03T08:39:12.226030041Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_52ac4054-bda3-421b-8610-7de7626486c8"
},
{
"CreatedAt": "2024-01-03T08:39:12.226605061Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_e3f7f235-1e1b-4a43-b846-0008ff2118d8"
},
{
"CreatedAt": "2024-01-03T08:39:12.227212191Z",
"Error": "",
"Results": null,
"State": "STARTED",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_5557a0ca-075b-49f3-9774-ce162c9374f1"
},
{
"CreatedAt": "2024-01-03T08:39:12.227804334Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_f2a36e18-967d-40da-b517-aabc7fc89569"
},
{
"CreatedAt": "2024-01-03T08:39:12.228397177Z",
"Error": "",
"Results": null,
"State": "STARTED",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_5a108e43-70c2-40a8-b27c-bee704fc5e76"
},
{
"CreatedAt": "2024-01-03T08:39:12.228908236Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_d7527787-14c8-4f98-b476-fde4b59a6571"
},
{
"CreatedAt": "2024-01-03T08:39:12.22931687Z",
"Error": "",
"Results": null,
"State": "STARTED",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_66d99522-943b-4966-b9b7-1e97a4cba588"
},
{
"CreatedAt": "2024-01-03T08:39:12.229646224Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_1d3b0598-3377-4091-93a4-7d4871c9fc9a"
},
{
"CreatedAt": "2024-01-03T08:39:12.229932206Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_42351c5c-5c3d-473b-b66f-fc80fb08a498"
},
{
"CreatedAt": "2024-01-03T08:39:12.230190035Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_551d39e1-ed35-47ef-816f-aaad7e304dc2"
},
{
"CreatedAt": "2024-01-03T08:39:12.230464886Z",
"Error": "",
"Results": null,
"State": "STARTED",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_0a4ccc77-a007-48e7-8162-558e603157c6"
},
{
"CreatedAt": "2024-01-03T08:39:12.23073062Z",
"Error": "rpc error: code = Internal desc = seed task failed: peer task failed: 4000/unexpected EOF",
"Results": null,
"State": "FAILURE",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_6a0475bb-9b0c-41bf-9f9e-234b7f876b32"
},
{
"CreatedAt": "2024-01-03T08:39:12.230986995Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_018430f5-ff34-424a-9f14-f1ee94430010"
},
{
"CreatedAt": "2024-01-03T08:39:12.231250765Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_93d2c8f5-4325-46c4-8cd9-1e4687dfb264"
},
{
"CreatedAt": "2024-01-03T08:39:12.231516108Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_59049d63-0520-4d27-8f59-7859f8cac2bf"
},
{
"CreatedAt": "2024-01-03T08:39:12.231793444Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_8ade9127-1979-457e-bbf1-1d2dbefe80ff"
},
{
"CreatedAt": "2024-01-03T08:39:12.232053527Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_eb84e69d-fb8a-4220-8667-8320a1de6635"
},
{
"CreatedAt": "2024-01-03T08:39:12.23230886Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_afabfacd-57d0-4266-86e7-f722e4a90c18"
},
{
"CreatedAt": "2024-01-03T08:39:12.232578281Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_dcbebd6a-62d2-4699-a7ab-dd1fd03b4043"
},
{
"CreatedAt": "2024-01-03T08:39:12.232823356Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_279b90e7-bf08-4f5d-a66a-ed925265f1fc"
},
{
"CreatedAt": "2024-01-03T08:39:12.233085663Z",
"Error": "",
"Results": [],
"State": "SUCCESS",
"TTL": 0,
"TaskName": "preheat",
"TaskUUID": "task_2f455382-d544-41a7-9f1b-eb14388c7e24"
}
],
"State": "FAILURE"
},
"user_id": 0,
"user": {
"id": 0,
"created_at": "0001-01-01T00:00:00Z",
"updated_at": "0001-01-01T00:00:00Z",
"is_del": 0,
"email": "",
"name": "",
"avatar": "",
"phone": "",
"state": "",
"location": "",
"bio": "",
"configs": null
},
"seed_peer_clusters": [],
"scheduler_clusters": [
{
"id": 1,
"created_at": "2024-01-03T08:13:15Z",
"updated_at": "2024-01-03T08:13:15Z",
"is_del": 0,
"name": "cluster-1",
"bio": "",
"config": {
"candidate_parent_limit": 4,
"filter_parent_limit": 40
},
"client_config": {
"concurrent_piece_count": 4,
"load_limit": 50
},
"scopes": {},
"is_default": true,
"seed_peer_clusters": null,
"schedulers": null,
"peers": null,
"jobs": null
}
]
}
The following are all the manager logs:
manager core.log
core.log
manager gin.log
gin.log
manager grpc.log
grpc.log
manager stderr.log
stderr.log
manager stdout.log
stdout.log
The following are all the scheduler logs:
core.log
gc.log
grpc.log
job.log
stderr.log
stdout.log
The following are all the seed peer logs:
core.log
gin.log
grpc.log
stderr.log
stdout.log
I filtered all the dfdaemon logs and found no error entries:
for i in `kl get pod|grep dfdaemon|awk '{print $1}'`;do echo "$i error log";kl exec -it dragonfly-dfdaemon-2zb6g cat /var/log/dragonfly/daemon/core.log|grep -i error;done
dragonfly-dfdaemon-2zb6g error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-4c5tm error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-59jpj error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-86t88 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-b5wq2 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-bgc78 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-brw29 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-bx8fx error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-c75fm error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-cffmb error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-cqvqv error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-dzwvt error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-flwkk error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-fq4rf error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-h5tg6 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-hj2mx error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-hnxt8 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-k2trd error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-kqxgt error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-l4v9l error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-ltpl9 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-mfvrj error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-mlfph error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-pf6ll error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-plgt4 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-rxxb8 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-sswg2 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-wlg5c error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-x6bs8 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-xds4g error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-z6s8f error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-zhb5z error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-zlfct error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
@zsksy123
Seed Peer Logs:
The task has 97 pieces in total.
{"level":"debug","ts":"2024-01-03 08:38:50.319","caller":"storage/local_storage.go:234","msg":"update total pieces: 97","task":"1240b3604ff90029979f795456360e221597801bafb1541cc733f77a1031483a","peer":"10.96.23.232-1-200cdb76-9346-40cb-84cd-1c4c0512554b_Seed","component":"localTaskStore"}
The piece length is 15728640 bytes.
{"level":"debug","ts":"2024-01-03 08:39:08.721","caller":"storage/local_storage.go:182","msg":"wrote 15728640 bytes to file /var/lib/dragonfly/1240b3604ff90029979f795456360e221597801bafb1541cc733f77a1031483a/10.96.23.232-1-200cdb76-9346-40cb-84cd-1c4c0512554b_Seed/data, piece 1, start 15728640, length: 15728640","task":"1240b3604ff90029979f795456360e221597801bafb1541cc733f77a1031483a","peer":"10.96.23.232-1-200cdb76-9346-40cb-84cd-1c4c0512554b_Seed","component":"localTaskStore"}
After 7434240 bytes of piece 68 had been written, the write failed with an "unexpected EOF" error.
{"level":"error","ts":"2024-01-03 08:48:36.544","caller":"peer/piece_manager.go:292","msg":"put piece to storage failed, piece num: 68, wrote: 7434240, error: unexpected EOF","peer":"10.96.23.232-1-200cdb76-9346-40cb-84cd-1c4c0512554b_Seed","task":"1240b3604ff90029979f795456360e221597801bafb1541cc733f77a1031483a","component":"PeerTask","trace":"f1934cc823a9f6a835ed7d4deb7e5f78","stacktrace":"d7y.io/dragonfly/v2/client/daemon/peer.(*pieceManager).processPieceFromSource\n\t/go/src/d7y.io/dragonfly/v2/client/daemon/peer/piece_manager.go:292\nd7y.io/dragonfly/v2/client/daemon/peer.(*pieceManager).downloadKnownLengthSource\n\t/go/src/d7y.io/dragonfly/v2/client/daemon/peer/piece_manager.go:490\nd7y.io/dragonfly/v2/client/daemon/peer.(*pieceManager).DownloadSource\n\t/go/src/d7y.io/dragonfly/v2/client/daemon/peer/piece_manager.go:475\nd7y.io/dragonfly/v2/client/daemon/peer.(*peerTaskConductor).backSource\n\t/go/src/d7y.io/dragonfly/v2/client/daemon/peer/peertask_conductor.go:505\nd7y.io/dragonfly/v2/client/daemon/peer.(*peerTaskConductor).pullPieces\n\t/go/src/d7y.io/dragonfly/v2/client/daemon/peer/peertask_conductor.go:527"}
While piece 68 was being downloaded, the HTTP range request was interrupted, which caused the write to fail. Please check why the connection to the source server was interrupted.