feat(consensus-policy): mock_chain_validation — Swallow* surface + validation refactor (M1)#3429
feat(consensus-policy): mock_chain_validation — Swallow* surface + validation refactor (M1)#3429bdchatham wants to merge 4 commits into
Conversation
|
You have used all Bugbot PR reviews included in your free trial for your GitHub account on this workspace. To continue using Bugbot reviews, enable Bugbot for your team in the Cursor dashboard. |
|
The latest Buf updates on your PR. Results from workflow Buf / buf (pull_request).
|
Codecov Report✅ All modified and coverable lines are covered by tests. Additional details and impacted files@@ Coverage Diff @@
## main #3429 +/- ##
==========================================
+ Coverage 59.34% 59.37% +0.02%
==========================================
Files 2112 2112
Lines 174772 174903 +131
==========================================
+ Hits 103724 103851 +127
Misses 62010 62010
- Partials 9038 9042 +4
Flags with carried forward coverage won't be shown. Click here to find out more.
🚀 New features to boost your workflow:
|
| @@ -87,21 +136,36 @@ func validateBlock(state State, block *types.Block, policy types.ConsensusPolicy | |||
| return errors.New("initial block can't have LastCommit signatures") | |||
| } | |||
| } else { | |||
| // LastCommit.Signatures length is checked in VerifyCommit. | |||
| // LastCommit.Signatures length is checked in VerifyCommit. Audit | |||
| // row 12 — load-bearing block-1 blocker for forked-boot with a new | |||
| // validator set; swallow-eligible via SwallowLastCommitVerifyFailure. | |||
| if err := state.LastValidators.VerifyCommit( | |||
| state.ChainID, state.LastBlockID, block.Height-1, block.LastCommit); err != nil { | |||
| return fmt.Errorf("VerifyCommit(): %w", err) | |||
| wrapped := fmt.Errorf("VerifyCommit(): %w", err) | |||
| if policy.SwallowLastCommitVerifyFailure() { | |||
| types.LogSwallowedFailure(logger, "last_commit_verify", "internal/state/validation.go:91", | |||
| block.Height, "valid LastCommit signatures", err.Error()) | |||
| } else { | |||
| return wrapped | |||
| } | |||
| } | |||
| } | |||
|
|
|||
| // NOTE: We can't actually verify it's the right proposer because we don't | |||
| // know what round the block was first proposed. So just check that it's | |||
| // a legit address and a known validator. | |||
| // a legit address and a known validator. Audit row 13 — swallow-eligible | |||
| // via SwallowProposerNotInValidatorSetFailure (forked-boot with new set). | |||
| // The length is checked in ValidateBasic above. | |||
| if !state.Validators.HasAddress(block.ProposerAddress) { | |||
| return fmt.Errorf("block.Header.ProposerAddress %X is not a validator", | |||
| err := fmt.Errorf("block.Header.ProposerAddress %X is not a validator", | |||
| block.ProposerAddress, | |||
| ) | |||
| if policy.SwallowProposerNotInValidatorSetFailure() { | |||
| types.LogSwallowedFailure(logger, "proposer_not_in_validator_set", "internal/state/validation.go:101", | |||
| block.Height, "proposer present in state.Validators", block.ProposerAddress) | |||
| } else { | |||
| return err | |||
| } | |||
| } | |||
|
|
|||
| // Validate block Time | |||
| @@ -128,9 +192,16 @@ func validateBlock(state State, block *types.Block, policy types.ConsensusPolicy | |||
| block.Height, state.InitialHeight) | |||
| } | |||
|
|
|||
| // Check evidence doesn't exceed the limit amount of bytes. | |||
| // Check evidence doesn't exceed the limit amount of bytes. Audit row 17 — | |||
| // swallow-eligible via SwallowEvidenceOverflowFailure. | |||
| if max, got := state.ConsensusParams.Evidence.MaxBytes, block.Evidence.ByteSize(); got > max { | |||
| return types.NewErrEvidenceOverflow(max, got) | |||
| err := types.NewErrEvidenceOverflow(max, got) | |||
| if policy.SwallowEvidenceOverflowFailure() { | |||
| types.LogSwallowedFailure(logger, "evidence_overflow", "internal/state/validation.go:132", | |||
| block.Height, max, got) | |||
| } else { | |||
| return err | |||
| } | |||
There was a problem hiding this comment.
Instead of doing this all in here, could we return typed validation errors and have the caller filter to say "if e in swallowedErrTypes then proceed
There was a problem hiding this comment.
Agreed — typed errors + caller filter is the more idiomatic shape. See the synthesis comment below for the rationale on shipping #3429 as-is and landing the refactor in M1.5 (a two-way door — the 13-method ConsensusPolicy interface coexists with the filter unchanged).
| // Audit row 18 — LastCommitHash (with legacy-6.4 fallback). Swallow- | ||
| // eligible via SwallowLastCommitHashFailure (forked-boot block-1). | ||
| if w, g := b.LastCommit.Hash(), b.LastCommitHash; !bytes.Equal(w, g) { | ||
| // Fall back to legacy hash calculation pre-6.4. | ||
| if wLegacy := b.LastCommit.legacyHash(); !bytes.Equal(wLegacy, g) { | ||
| return fmt.Errorf("wrong Header.LastCommitHash. Expected %X, got %X", w, g) | ||
| err := fmt.Errorf("wrong Header.LastCommitHash. Expected %X, got %X", w, g) | ||
| if policy.SwallowLastCommitHashFailure() { | ||
| LogSwallowedFailure(logger, "last_commit_hash", "types/block.go:91", | ||
| b.Height, w, g) | ||
| } else { | ||
| return err | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Audit row 2 — DataHash. Already short-circuited by | ||
| // SkipDataHashValidation (mock_block_validation); when the comparison | ||
| // runs, SwallowDataHashFailure (mock_chain_validation) governs return. | ||
| if !policy.SkipDataHashValidation() { | ||
| // NOTE: b.Data.Txs may be nil, but b.Data.Hash() still works fine. | ||
| if w, g := b.Data.Hash(false), b.DataHash; !bytes.Equal(w, g) { | ||
| return fmt.Errorf("wrong Header.DataHash. Expected %X, got %X. Len of txs %d", w, g, len(b.Txs)) | ||
| err := fmt.Errorf("wrong Header.DataHash. Expected %X, got %X. Len of txs %d", w, g, len(b.Txs)) | ||
| if policy.SwallowDataHashFailure() { | ||
| LogSwallowedFailure(logger, "data_hash", "types/block.go:100", | ||
| b.Height, w, g) | ||
| } else { | ||
| return err | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // NOTE: b.Evidence may be nil, but we're just looping. | ||
| // NOTE: b.Evidence may be nil, but we're just looping. Audit row 20 — | ||
| // per-evidence ValidateBasic; swallow-eligible via | ||
| // SwallowPerEvidenceValidateBasicFailure. | ||
| for i, ev := range b.Evidence { | ||
| if err := ev.ValidateBasic(); err != nil { | ||
| return fmt.Errorf("invalid evidence (#%d): %v", i, err) | ||
| wrapped := fmt.Errorf("invalid evidence (#%d): %v", i, err) | ||
| if policy.SwallowPerEvidenceValidateBasicFailure() { | ||
| LogSwallowedFailure(logger, "per_evidence_validate_basic", "types/block.go:106", | ||
| b.Height, fmt.Sprintf("valid evidence at index %d", i), err.Error()) | ||
| } else { | ||
| return wrapped | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Audit row 19 — EvidenceHash, swallow-eligible via | ||
| // SwallowEvidenceHashFailure. | ||
| if w, g := b.Evidence.Hash(), b.EvidenceHash; !bytes.Equal(w, g) { | ||
| return fmt.Errorf("wrong Header.EvidenceHash. Expected %X, got %X", w, g) | ||
| err := fmt.Errorf("wrong Header.EvidenceHash. Expected %X, got %X", w, g) | ||
| if policy.SwallowEvidenceHashFailure() { | ||
| LogSwallowedFailure(logger, "evidence_hash", "types/block.go:112", | ||
| b.Height, w, g) | ||
| } else { | ||
| return err | ||
| } |
There was a problem hiding this comment.
Similar question as above. How can we roll up all the normal handling that happens today into a single call location and wrap any error output from that through the swallow filtering? This prevents this code change for each location and centralizes this logic
There was a problem hiding this comment.
Same — see the synthesis comment below. Filter goes inside validateBlock (not BlockExecutor.ValidateBlock) because Block.ValidateBasic is also called from non-state callers like BlockFromProto; filtering at the lowest common ancestor catches both surfaces.
|
Engaged coral (sei-network-specialist + product-engineer as scope-cutter) to think through the two comments above. Both specialists agreed with the architectural critique: typed validation errors + a single caller-side filter is the more idiomatic shape. They also converged on the same verdict for how to land it: ship #3429 as the M1 foundation, refactor in an M1.5 follow-up PR. Why a follow-up rather than revising in placeTwo-way door. The current 13-method Cycle time. Shipping #3429 unblocks M2 (the trivial Diff is roughly a wash. ~95 lines of new typed-error infrastructure ≈ ~65 lines saved at call sites. The win is shape, not size. Concrete M1.5 PR spec (the specialists worked this out)New file:
Filter location: inside Per-site change becomes mechanical: // before
err := fmt.Errorf("wrong Block.Header.AppHash. Expected %X, got %v", state.AppHash, block.AppHash)
if policy.SwallowAppHashFailure() {
types.LogSwallowedFailure(...)
} else {
return err
}
// after
return types.NewAppHashError(block.Height, state.AppHash, block.AppHash)
Filing M1.5 as a tracked sei-chain issue alongside this comment so the follow-up doesn't get lost. Cycle: merge #3429 → land M2 (small additive variant file) → land M1.5 (style refactor on top of M2) → proceed to M3. Sound right, or would you rather we revise in place after all? |
…rrorKind) Replace the per-failure Skip*Validation surface with a single ShouldSwallow(kind ErrorKind) bool method on ConsensusPolicy, consulted by every halting check in the block-validation hot path. - types/consensus_policy.go: ErrorKind type + 13 constants (one per audit-row swallow-eligible site) and AllSwallowEligibleErrorKinds(). - consensus_policy_default.go: ShouldSwallow always false; build constraint widened to !mock_block_validation && !mock_chain_validation. - consensus_policy_mock_block_validation.go: ShouldSwallow returns true for AppHash and DataHash (preserving the tag's user-visible chain-progresses semantics), false for the other 11 kinds. Drops the Skip*Validation short-circuit — the comparison now runs and the divergence is logged before being swallowed. Tests assert the variant matrix on both build tags and pin the 13-row audit invariant. This is step A of the two-commit M1.1 rewrite for sei-chain#3427. The call-site refactor in internal/state/validation.go and types/block.go follows in the next commit and is what restores the build.
…owOrErr
Consolidate per-site swallow boilerplate into a single types.SwallowOrErr
helper and refactor every audit-row swallow-eligible site to use it.
Every check now computes its comparison authentically; the ConsensusPolicy
governs only the failure-handling decision.
- types/swallow.go: SwallowOrErr(policy, kind, logger, site, height,
expected, got, format, args...) — returns nil to swallow (logs +
increments sei_unsafe_validation_skipped_total{site, kind}) or a
formatted error to halt. LogSwallowedFailure produces the structured
divergence record; counter is auto-registered via promauto.
- internal/state/validation.go: 9 audit-row sites routed through
SwallowOrErr (AppHash, LastBlockID, ConsensusHash, LastResultsHash,
ValidatorsHash, NextValidatorsHash, LastCommitVerify,
ProposerNotInValidatorSet, EvidenceOverflow). The pre-existing
tmtypes.SkipLastResultsHashValidation atomic.Bool early-return guard
is preserved as the sole carve-out — load-bearing for Giga's
production halt-resistance; migration to a build-tagged variant is a
future Giga workstream documented inline.
- types/block.go: 4 audit-row sites routed through SwallowOrErr
(LastCommitHash, DataHash, PerEvidenceValidateBasic, EvidenceHash).
The Skip*Validation short-circuit on DataHash is gone — the
comparison now runs unconditionally; only the failure-handling
differs per ConsensusPolicy. Trades the Data.Hash(false) compute
fast-path for authentic prod-code-path execution under all build tags
(explicitly accepted in review).
Behavior preservation under mock_block_validation:
- AppHash and DataHash mismatches are still bypassed (chain continues)
but are now LOGGED in addition to being swallowed.
- Three pre-existing test failures keep failing in the same shape:
TestBlockValidateBasic (Tampered_Data, Tampered_DataHash subtests)
and TestValidateBlockHeader (DataHash subcase) — all assert an error
on tampered data and now observe the swallowed nil. TestBlockProtoBuf,
which previously failed because SkipDataHashValidation suppressed the
side-effect populate of Data.hash, now passes — a positive
consequence of the explicitly-accepted fast-path trade.
Step B of two for sei-chain#3427.
a2d076c to
1eecec7
Compare
|
You have used all Bugbot PR reviews included in your free trial for your GitHub account on this workspace. To continue using Bugbot reviews, enable Bugbot for your team in the Cursor dashboard. |
|
Force-pushed — replaces the prior shape with a cleaner architecture per the design conversation. Branch is now two commits on top of main ( What changed vs the prior shape
Why this is betterAlways-authentic code path. The user's framing: "we always want to use as close as possible to the authentic code path as we can." Every check now computes its comparison regardless of policy. Only the failure-handling differs (return err vs log+counter+continue). Brings non-prod builds closer to prod execution shape; gains observability on the existing One policy decision, not 13. Per-site authentic execution. Each check runs independently. A swallowed failure logs + counter increments + continues to the next check. The whole code path runs under Idiomatic Go. Build-tag-switched variant files (matches stdlib Carve-out preserved
Acceptance verified
Cleanup
Ready for re-review when you are. |
Comment-only sweep on the M1 PR. Cuts ~175 lines of comments that restated what the code obviously does (per-constant ErrorKind docstrings, per-site audit-row tags, duplicated build-tag matrices, parameter-restating docstrings on SwallowOrErr). Keeps the load-bearing ones: the SkipLastResultsHashValidation Giga escape-hatch carve-out, a 10-line package doc naming the build-tag variant pattern, the SwallowOrErr nil/error contract, the mock_block_validation tradeoff (AppHash+DataHash only preserves prior tag outcomes), and the audit-row count invariant test.
|
You have used all Bugbot PR reviews included in your free trial for your GitHub account on this workspace. To continue using Bugbot reviews, enable Bugbot for your team in the Cursor dashboard. |
|
Comment-density sweep added as commit What stayed: the What went: per-constant
PR is now three commits: interface collapse → call-site refactor → comment sweep. Each independently reviewable. |
| block.LastBlockID, | ||
| ) | ||
| if err := types.SwallowOrErr(policy, types.ErrorKindLastBlockID, logger, | ||
| "internal/state/validation.go:LastBlockID", block.Height, |
There was a problem hiding this comment.
Would prefer to remove "internal/state/validation.go:LastBlockID"
| if err := types.SwallowOrErr(policy, types.ErrorKindLastBlockID, logger, | ||
| "internal/state/validation.go:LastBlockID", block.Height, | ||
| state.LastBlockID, block.LastBlockID, | ||
| "wrong Block.Header.LastBlockID. Expected %v, got %v", |
There was a problem hiding this comment.
We don't need to move the logging in. If an error isn't swallowed, you can log it in this layer still. That removes the responsibility of the helper to parameterize each flavor's log message.
…kind, err) ShouldSwallow now takes the candidate error directly and returns either nil (swallow + counter increment, in the variant that opts in) or the same error unchanged (halt). Counter increments move inside the per-variant ShouldSwallow body; the site label is dropped from the metric, leaving only kind. The external SwallowOrErr / LogSwallowedFailure helpers and every per-site site/logger/expected/got argument are removed. Production behavior is unchanged: the default variant returns err for every kind. mock_block_validation outcomes are unchanged: AppHash and DataHash mismatches still don't halt, just via a slightly different code path. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
|
You have used all Bugbot PR reviews included in your free trial for your GitHub account on this workspace. To continue using Bugbot reviews, enable Bugbot for your team in the Cursor dashboard. |
|
Revision per the two comments above — fourth commit Shape after revision
// types/consensus_policy_mock_block_validation.go
func (ConsensusPolicy) ShouldSwallow(kind ErrorKind, err error) error {
switch kind {
case ErrorKindAppHash, ErrorKindDataHash:
unsafeValidationSkippedTotal.WithLabelValues(string(kind)).Inc()
return nil
default:
return err
}
}Per-site: if !bytes.Equal(block.AppHash, state.AppHash) {
if err := policy.ShouldSwallow(types.ErrorKindAppHash,
fmt.Errorf("wrong Block.Header.AppHash. Expected %X, got %v",
state.AppHash, block.AppHash)); err != nil {
return err
}
}What disappeared
What stayed
Diff
Acceptance
PR now four commits, each independently reviewable. Ready for another look. |
Summary
Implements M1 of #3427 — extends
ConsensusPolicywith 13Swallow*Failure() boolmethods covering the swallow-eligible halting-check surface, then refactors those 13 call sites ininternal/state/validation.go+types/block.gofrom "return err on mismatch" to compute-log-conditionally-return.Production behavior unchanged. Default
ConsensusPolicyreturnsfalsefor everySwallow*Failure, so every site takes the originalreturn errpath. The new code path is dormant until M2 lands themock_chain_validationvariant that flipsSwallow*Failuretotrue.Two reviewable commits
Per-package work
sei-tendermint/types/consensus_policy.go— central doc and method matrix; per-receiver method decls live in the variant files (Go forbids same-receiver redeclaration across files in one package).sei-tendermint/types/consensus_policy_default.go— build tag widened from!mock_block_validationto!mock_block_validation && !mock_chain_validation(required so M2's variant file can land without duplicate-declaration compile failure). All 13Swallow*Failuremethods returnfalse.sei-tendermint/types/consensus_policy_mock_block_validation.go— preserves existing tag semantics:Skip*methods still returntrue(short-circuit before computing), all 13 newSwallow*Failuremethods returnfalse(no compute-and-swallow behavior added to this tag).sei-tendermint/types/swallow.go(new) — package-level PrometheusCounterVecforsei_unsafe_validation_skipped_total{kind}via promauto, plusLogSwallowedFailure()helper emitting structured slog Error withmock_chain_validation/kind/site/height/expected/gotfields. Direct Prometheus usage matches the precedent ininternal/autobahn/data/metrics.go.sei-tendermint/internal/state/validation.go— 9 sites refactored (audit rows 1, 6, 7, 8, 9, 10, 12, 13, 17). Row 8 (LastResultsHash) preserves theatomic.BoolGiga short-circuit in the outer condition; comparison only runs when atomic is false, and the Swallow path is consulted only inside that branch. Giga's fast-path semantics are preserved;mock_chain_validationparticipates via the policy interface instead of the atomic.sei-tendermint/types/block.go— 4 sites refactored (audit rows 2, 18, 19, 20).sei-tendermint/types/consensus_policy_test.go+consensus_policy_mock_block_validation_test.go— variant-matrix tests covering all 15 methods (2Skip*+ 13Swallow*).What's NOT in this PR
consensus_policy_mock_chain_validation.govariant file (returningtruefor everySwallow*Failure). That's M2 (next PR).Swallow*Failure() == trueend-to-end. Also M2 — the variant has to exist before those tests are runnable.Pre-existing test failures (verified as baseline)
Reproduce on the M1 step-A baseline commit before my changes:
-tags mock_block_validationruns:TestBlockValidateBasic,TestBlockProtoBuf(types),TestValidateBlockHeader(internal/state) all fail because the existing tag intentionally nilsBlock.Hash()and those tests don't account for it.TestNodeRestartEventAllowsRecreateis a TCP-port flake (bind: address already in use).None introduced by this PR.
Test plan
go build ./sei-tendermint/...passesgo build -tags mock_block_validation ./sei-tendermint/...passesgo test ./sei-tendermint/...passes (modulo the TCP flake)go test -tags mock_block_validation ./sei-tendermint/...passes (modulo the three pre-existing failures)ConsensusPolicymethods in both the default build and-tags mock_block_validationSwallow*Failure()naming and method matrixLastResultsHashatomic.Bool ↔ policy-interface coexistence (Giga's existing flip-on semantics preserved)swallow.go's package-level promauto.NewCounterVec (vs go-kit metrics elsewhere — direct Prom precedent cited)References
mock_chain_validationbuild tag (compute-and-swallow consensus + module-genesis failures) #3427🤖 Generated with Claude Code