cosmos/cosmos-sdk

Gaia-7003 halt stack trace

zmanian opened this issue · 8 comments

E[07-29|15:20:36.482] CONSENSUS FAILURE!!! module=consensus err="should not already be unbonded, validator: {974A7590D8BC1B9D323F7A9770D5388D324DA0E7 PubKeyEd25519{C1D509AC04DB20DEC2F698330F3681678580350CD02FEFEBFB7EB25EA0DD7824} false 0 5/1 5/1 {fullnode04 [do-not-modify] [do-not-modify] [do-not-modify]} 27237 0 0/1 0/1 0/1 0/1 0/1}\n" stack="goroutine 131181 [running]:\nruntime/debug.Stack(0xc427b0bb58, 0xd014a0, 0xc424bc1cc0)\n\t/snap/go/2130/src/runtime/debug/stack.go:24 +0xa7\ngithub.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).receiveRoutine.func1(0xc420112600)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:558 +0x57\npanic(0xd014a0, 0xc424bc1cc0)\n\t/snap/go/2130/src/runtime/panic.go:502 +0x229\ngithub.com/cosmos/cosmos-sdk/x/stake/keeper.Keeper.unbondValidator(0x105db80, 0xc42003ef10, 0xc4200e2460, 0x105db80, 0xc42003eef0, 0xfe7850, 0xc4200e2460, 0x4, 0x1065880, 0xc4274170b0, ...)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/x/stake/keeper/validator.go:487 +0x797\ngithub.com/cosmos/cosmos-sdk/x/stake/keeper.Keeper.UpdateBondedValidators(0x105db80, 0xc42003ef10, 0xc4200e2460, 0x105db80, 0xc42003eef0, 0xfe7850, 0xc4200e2460, 0x4, 0x1065880, 0xc4274170b0, ...)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/x/stake/keeper/validator.go:384 +0x867\ngithub.com/cosmos/cosmos-sdk/x/stake/keeper.Keeper.UpdateValidator(0x105db80, 0xc42003ef10, 0xc4200e2460, 0x105db80, 0xc42003eef0, 0xfe7850, 0xc4200e2460, 0x4, 0x1065880, 0xc4274170b0, ...)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/x/stake/keeper/validator.go:238 +0x6db\ngithub.com/cosmos/cosmos-sdk/x/stake/keeper.Keeper.Slash(0x105db80, 0xc42003ef10, 0xc4200e2460, 0x105db80, 0xc42003eef0, 0xfe7850, 0xc4200e2460, 0x4, 0x1065880, 0xc4274170b0, ...)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/x/stake/keeper/slash.go:103 
+0xccc\ngithub.com/cosmos/cosmos-sdk/x/slashing.Keeper.handleValidatorSignature(0x105db80, 0xc42003ef30, 0xc4200e2460, 0x106bf20, 0xc420080880, 0xa, 0x1065880, 0xc4274170b0, 0xc425699a00, 0x9, ...)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/x/slashing/keeper.go:106 +0x7ea\ngithub.com/cosmos/cosmos-sdk/x/slashing.BeginBlocker(0x1065880, 0xc4274170b0, 0xc425699a00, 0x9, 0xc427ac0220, 0x14, 0x20, 0xc4217867c0, 0x9, 0x6de4, ...)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/x/slashing/tick.go:28 +0x1d8\ngithub.com/cosmos/cosmos-sdk/cmd/gaia/app.(*GaiaApp).BeginBlocker(0xc4207c21a0, 0x1065880, 0xc4274170b0, 0xc425699a00, 0x9, 0xc427ac0220, 0x14, 0x20, 0xc4217867c0, 0x9, ...)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/cmd/gaia/app/app.go:131 +0xc3\ngithub.com/cosmos/cosmos-sdk/cmd/gaia/app.(*GaiaApp).BeginBlocker-fm(0x1065880, 0xc4274170b0, 0xc425699a00, 0x9, 0xc427ac0220, 0x14, 0x20, 0xc4217867c0, 0x9, 0x6de4, ...)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/cmd/gaia/app/app.go:103 +0xa0\ngithub.com/cosmos/cosmos-sdk/baseapp.(*BaseApp).BeginBlock(0xc42047e180, 0xc427ac0220, 0x14, 0x20, 0xc4217867c0, 0x9, 0x6de4, 0x5b5ddb43, 0x0, 0x520, ...)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/baseapp/baseapp.go:431 +0x1ef\ngithub.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/abci/client.(*localClient).BeginBlockSync(0xc420085a40, 0xc427ac0220, 0x14, 0x20, 0xc4217867c0, 0x9, 0x6de4, 0x5b5ddb43, 0x0, 0x520, ...)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/abci/client/local_client.go:206 +0xab\ngithub.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/proxy.(*appConnConsensus).BeginBlockSync(0xc4208ab440, 0xc427ac0220, 0x14, 0x20, 0xc4217867c0, 0x9, 0x6de4, 0x5b5ddb43, 0x0, 0x520, ...)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/proxy/app_conn.go:69 
+0x78\ngithub.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/state.execBlockOnProxyApp(0x1066440, 0xc421395b20, 0x106b2c0, 0xc4208ab440, 0xc4229381a0, 0xc42b74e330, 0x106f4e0, 0xc4200b8850, 0x1, 0xc42122a240, ...)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/state/execution.go:190 +0x53b\ngithub.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/state.(*BlockExecutor).ApplyBlock(0xc42233d260, 0xc4223898d0, 0x9, 0x6de3, 0x520, 0xc42c80eb60, 0x14, 0x20, 0x1, 0xc42122a240, ...)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/state/execution.go:76 +0x12f\ngithub.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).finalizeCommit(0xc420112600, 0x6de4)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:1290 +0xba6\ngithub.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).tryFinalizeCommit(0xc420112600, 0x6de4)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:1221 +0x468\ngithub.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).enterCommit.func1(0xc420112600, 0x1, 0x6de4)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:1169 +0x98\ngithub.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).enterCommit(0xc420112600, 0x6de4, 0x1)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:1198 +0x802\ngithub.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).addVote(0xc420112600, 0xc426d495e0, 0xc4273f9dd0, 0x28, 0xfc, 0x0, 0x0)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:1601 
+0xbb4\ngithub.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).tryAddVote(0xc420112600, 0xc426d495e0, 0xc4273f9dd0, 0x28, 0xfc, 0xf4)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:1459 +0x56\ngithub.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).handleMsg(0xc420112600, 0xd3e4a0, 0xc426b0d2d8, 0xc4273f9dd0, 0x28)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:628 +0x64f\ngithub.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).receiveRoutine(0xc420112600, 0x0)\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:580 +0x6d2\ncreated by github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).OnStart\n\t/home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:295 +0x140\n"

Issues started happening after we hit the max limit of validators and some started getting ejected.

The validator highlighted in the stack trace was ejected 650 blocks before the network halted.

I've also been discussing with another validator and we found another bug. When you are ejected, you are not erased from the extended validator set. We tried to delegate more steak from his validator address to his validator address (because there is no self-bond command), in order to try and come back in the active val set. The tx was committed, he lost some steak. But validator power remained unchanged (although the amount of steak was sufficient to come back in the val set at the time). So some steak got lost at some point...

I don't know what "Fullnode04" did to cause the failure, if he did anything besides being ejected, but we potentially have multiple issues on our hands related to the validator cliff/validator candidates.

This fault is probably a result of #1839, but should be confirmed by state examination.

> I've also been discussing with another validator and we found another bug. When you are ejected, you are not erased from the extended validator set. We tried to delegate more steak from his validator address to his validator address (because there is no self-bond command), in order to try and come back in the active val set. The tx was committed, he lost some steak. But validator power remained unchanged (although the amount of steak was sufficient to come back in the val set at the time). So some steak got lost at some point...

This might be another symptom of the same cause but we should make sure - can you/he provide exact information about which validator address, which transactions, etc so we can debug?

There were two transactions. I'm posting them below @cwgoes.

First tx:

gaiacli stake delegate --address-delegator cosmosaccaddr18thamkhnj9wz8pa4nhnp9rldprgant57ryzag7 --address-validator cosmosaccaddr18thamkhnj9wz8pa4nhnp9rldprgant57ryzag7 --chain-id=gaia-7003 --from=delegate-now --amount 40steak
Defaulting to account number: 212
Defaulting to next sequence number: 3
Password to sign with 'delegate-now':
Committed at block 27480. Hash: C9D19EC5B2188BDDAB06B09C6FC038EA022A8C1F

Second tx:

gaiacli stake delegate --address-delegator cosmosaccaddr18thamkhnj9wz8pa4nhnp9rldprgant57ryzag7 --address-validator cosmosaccaddr18thamkhnj9wz8pa4nhnp9rldprgant57ryzag7 --chain-id=gaia-7003 --from=delegate-now --amount 100steak
Defaulting to account number: 212
Defaulting to next sequence number: 2
Password to sign with 'delegate-now':
Committed at block 27438. Hash: 5D1175FC2078513CDFA129C94900B9EF541B10EF

@gamarin2

> I've also been discussing with another validator and we found another bug. When you are ejected, you are not erased from the extended validator set.

Can you open up a bug report for these points? I think this is probably a separate issue

> Can you open up a bug report for these points? I think this is probably a separate issue

I believe this is normal behaviour re: not being erased. When you are ejected from the active val set because you have insufficient voting power, you become unbonding (or unbonded?). But you should not be erased.

I will open an issue for the other point.

Reformatted easier-to-read stack trace:

E[07-29|15:20:36.482] CONSENSUS FAILURE!!!                         module=consensus err="should not already be unbonded,  validator: {974A7590D8BC1B9D323F7A9770D5388D324DA0E7 PubKeyEd25519{C1D509AC04DB20DEC2F698330F3681678580350CD02FEFEBFB7EB25EA0DD7824} false 0 5/1 5/1 {fullnode04 [do-not-modify] [do-not-modify] [do-not-modify]} 27237 0  0/1 0/1 0/1 0/1 0/1}
" stack="goroutine 131181 [running]:
runtime/debug.Stack(0xc427b0bb58, 0xd014a0, 0xc424bc1cc0)
  /snap/go/2130/src/runtime/debug/stack.go:24 +0xa7
github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).receiveRoutine.func1(0xc420112600)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:558 +0x57
panic(0xd014a0, 0xc424bc1cc0)
  /snap/go/2130/src/runtime/panic.go:502 +0x229
github.com/cosmos/cosmos-sdk/x/stake/keeper.Keeper.unbondValidator(0x105db80, 0xc42003ef10, 0xc4200e2460, 0x105db80, 0xc42003eef0, 0xfe7850, 0xc4200e2460, 0x4, 0x1065880, 0xc4274170b0, ...)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/x/stake/keeper/validator.go:487 +0x797
github.com/cosmos/cosmos-sdk/x/stake/keeper.Keeper.UpdateBondedValidators(0x105db80, 0xc42003ef10, 0xc4200e2460, 0x105db80, 0xc42003eef0, 0xfe7850, 0xc4200e2460, 0x4, 0x1065880, 0xc4274170b0, ...)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/x/stake/keeper/validator.go:384 +0x867
github.com/cosmos/cosmos-sdk/x/stake/keeper.Keeper.UpdateValidator(0x105db80, 0xc42003ef10, 0xc4200e2460, 0x105db80, 0xc42003eef0, 0xfe7850, 0xc4200e2460, 0x4, 0x1065880, 0xc4274170b0, ...)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/x/stake/keeper/validator.go:238 +0x6db
github.com/cosmos/cosmos-sdk/x/stake/keeper.Keeper.Slash(0x105db80, 0xc42003ef10, 0xc4200e2460, 0x105db80, 0xc42003eef0, 0xfe7850, 0xc4200e2460, 0x4, 0x1065880, 0xc4274170b0, ...)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/x/stake/keeper/slash.go:103 +0xccc
github.com/cosmos/cosmos-sdk/x/slashing.Keeper.handleValidatorSignature(0x105db80, 0xc42003ef30, 0xc4200e2460, 0x106bf20, 0xc420080880, 0xa, 0x1065880, 0xc4274170b0, 0xc425699a00, 0x9, ...)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/x/slashing/keeper.go:106 +0x7ea
github.com/cosmos/cosmos-sdk/x/slashing.BeginBlocker(0x1065880, 0xc4274170b0, 0xc425699a00, 0x9, 0xc427ac0220, 0x14, 0x20, 0xc4217867c0, 0x9, 0x6de4, ...)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/x/slashing/tick.go:28 +0x1d8
github.com/cosmos/cosmos-sdk/cmd/gaia/app.(*GaiaApp).BeginBlocker(0xc4207c21a0, 0x1065880, 0xc4274170b0, 0xc425699a00, 0x9, 0xc427ac0220, 0x14, 0x20, 0xc4217867c0, 0x9, ...)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/cmd/gaia/app/app.go:131 +0xc3
github.com/cosmos/cosmos-sdk/cmd/gaia/app.(*GaiaApp).BeginBlocker-fm(0x1065880, 0xc4274170b0, 0xc425699a00, 0x9, 0xc427ac0220, 0x14, 0x20, 0xc4217867c0, 0x9, 0x6de4, ...)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/cmd/gaia/app/app.go:103 +0xa0
github.com/cosmos/cosmos-sdk/baseapp.(*BaseApp).BeginBlock(0xc42047e180, 0xc427ac0220, 0x14, 0x20, 0xc4217867c0, 0x9, 0x6de4, 0x5b5ddb43, 0x0, 0x520, ...)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/baseapp/baseapp.go:431 +0x1ef
github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/abci/client.(*localClient).BeginBlockSync(0xc420085a40, 0xc427ac0220, 0x14, 0x20, 0xc4217867c0, 0x9, 0x6de4, 0x5b5ddb43, 0x0, 0x520, ...)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/abci/client/local_client.go:206 +0xab
github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/proxy.(*appConnConsensus).BeginBlockSync(0xc4208ab440, 0xc427ac0220, 0x14, 0x20, 0xc4217867c0, 0x9, 0x6de4, 0x5b5ddb43, 0x0, 0x520, ...)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/proxy/app_conn.go:69 +0x78
github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/state.execBlockOnProxyApp(0x1066440, 0xc421395b20, 0x106b2c0, 0xc4208ab440, 0xc4229381a0, 0xc42b74e330, 0x106f4e0, 0xc4200b8850, 0x1, 0xc42122a240, ...)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/state/execution.go:190 +0x53b
github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/state.(*BlockExecutor).ApplyBlock(0xc42233d260, 0xc4223898d0, 0x9, 0x6de3, 0x520, 0xc42c80eb60, 0x14, 0x20, 0x1, 0xc42122a240, ...)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/state/execution.go:76 +0x12f
github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).finalizeCommit(0xc420112600, 0x6de4)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:1290 +0xba6
github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).tryFinalizeCommit(0xc420112600, 0x6de4)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:1221 +0x468
github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).enterCommit.func1(0xc420112600, 0x1, 0x6de4)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:1169 +0x98
github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).enterCommit(0xc420112600, 0x6de4, 0x1)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:1198 +0x802
github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).addVote(0xc420112600, 0xc426d495e0, 0xc4273f9dd0, 0x28, 0xfc, 0x0, 0x0)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:1601 +0xbb4
github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).tryAddVote(0xc420112600, 0xc426d495e0, 0xc4273f9dd0, 0x28, 0xfc, 0xf4)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:1459 +0x56
github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).handleMsg(0xc420112600, 0xd3e4a0, 0xc426b0d2d8, 0xc4273f9dd0, 0x28)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:628 +0x64f
github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).receiveRoutine(0xc420112600, 0x0)
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:580 +0x6d2
created by github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).OnStart
  /home/zaki/go/src/github.com/cosmos/cosmos-sdk/vendor/github.com/tendermint/tendermint/consensus/state.go:295 +0x140

See the postmortem.