From 404188d9ecc6a3132874edb65145b5ffa43c4960 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 17 Jul 2023 17:16:42 +0200 Subject: [PATCH 1/2] DEPS: bump raft Picks up - https://github.com/etcd-io/raft/pull/77 - https://github.com/etcd-io/raft/pull/79 (essentially an off-by-default for 77) - https://github.com/etcd-io/raft/pull/82 - https://github.com/etcd-io/raft/pull/81 (needed for #105797) Epic: none Release note: None --- DEPS.bzl | 6 +++--- build/bazelutil/distdir_files.bzl | 2 +- go.mod | 2 +- go.sum | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/DEPS.bzl b/DEPS.bzl index 3f72b11970ac..ae6e71cb89ad 100644 --- a/DEPS.bzl +++ b/DEPS.bzl @@ -10577,10 +10577,10 @@ def go_deps(): ], build_file_proto_mode = "default", importpath = "go.etcd.io/raft/v3", - sha256 = "7540ce70b9c79987eb5a09f693302931ac88e34757397451c5d44c202649adb3", - strip_prefix = "go.etcd.io/raft/v3@v3.0.0-20230626154957-a10cd4571633", + sha256 = "292954320a69953ac8366cd7a83e4a50b1cfd858643a4cdf9947ab9607f1a142", + strip_prefix = "go.etcd.io/raft/v3@v3.0.0-20230717153924-72a6e6c9f3ee", urls = [ - "https://storage.googleapis.com/cockroach-godeps/gomod/go.etcd.io/raft/v3/io_etcd_go_raft_v3-v3.0.0-20230626154957-a10cd4571633.zip", + "https://storage.googleapis.com/cockroach-godeps/gomod/go.etcd.io/raft/v3/io_etcd_go_raft_v3-v3.0.0-20230717153924-72a6e6c9f3ee.zip", ], ) go_repository( diff --git a/build/bazelutil/distdir_files.bzl b/build/bazelutil/distdir_files.bzl index c38c01c09a70..f5d9446ab8db 100644 --- a/build/bazelutil/distdir_files.bzl +++ b/build/bazelutil/distdir_files.bzl @@ -1023,7 +1023,7 @@ DISTDIR_FILES = { "https://storage.googleapis.com/cockroach-godeps/gomod/go.etcd.io/etcd/client/pkg/v3/io_etcd_go_etcd_client_pkg_v3-v3.5.0.zip": "c0ca209767c5734c6ed023888ba5be02aab5bd3c4d018999467f2bfa8bf65ee3", "https://storage.googleapis.com/cockroach-godeps/gomod/go.etcd.io/etcd/client/v2/io_etcd_go_etcd_client_v2-v2.305.0.zip": "91fcb507fe8c193844b56bfb6c8741aaeb6ffa11ee9043de2af0f141173679f3", "https://storage.googleapis.com/cockroach-godeps/gomod/go.etcd.io/etcd/io_etcd_go_etcd-v0.5.0-alpha.5.0.20200910180754-dd1b699fc489.zip": "d982ee501979b41b68625693bad77d15e4ae79ab9d0eae5f6028205f96a74e49", - "https://storage.googleapis.com/cockroach-godeps/gomod/go.etcd.io/raft/v3/io_etcd_go_raft_v3-v3.0.0-20230626154957-a10cd4571633.zip": "7540ce70b9c79987eb5a09f693302931ac88e34757397451c5d44c202649adb3", + "https://storage.googleapis.com/cockroach-godeps/gomod/go.etcd.io/raft/v3/io_etcd_go_raft_v3-v3.0.0-20230717153924-72a6e6c9f3ee.zip": "292954320a69953ac8366cd7a83e4a50b1cfd858643a4cdf9947ab9607f1a142", "https://storage.googleapis.com/cockroach-godeps/gomod/go.mongodb.org/mongo-driver/org_mongodb_go_mongo_driver-v1.5.1.zip": "446cff132e82c64af7ffcf48e268eb16ec81f694914aa6baecb06cbbae1be0d7", "https://storage.googleapis.com/cockroach-godeps/gomod/go.mozilla.org/pkcs7/org_mozilla_go_pkcs7-v0.0.0-20200128120323-432b2356ecb1.zip": "3c4c1667907ff3127e371d44696326bad9e965216d4257917ae28e8b82a9e08d", "https://storage.googleapis.com/cockroach-godeps/gomod/go.opencensus.io/io_opencensus_go-v0.24.0.zip": "203a767d7f8e7c1ebe5588220ad168d1e15b14ae70a636de7ca9a4a88a7e0d0c", diff --git a/go.mod b/go.mod index 2e2be1eac64a..c79e508d47cf 100644 --- a/go.mod +++ b/go.mod @@ -219,7 +219,7 @@ require ( github.com/xdg-go/scram v1.1.2 github.com/xdg-go/stringprep v1.0.4 github.com/zabawaba99/go-gitignore v0.0.0-20200117185801-39e6bddfb292 - go.etcd.io/raft/v3 v3.0.0-20230626154957-a10cd4571633 + go.etcd.io/raft/v3 v3.0.0-20230717153924-72a6e6c9f3ee go.opentelemetry.io/otel v1.0.0-RC3 go.opentelemetry.io/otel/exporters/jaeger v1.0.0-RC3 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.0.0-RC3 diff --git a/go.sum b/go.sum index 2c7aac429701..2bf3a8936b57 100644 --- a/go.sum +++ b/go.sum @@ -2320,8 +2320,8 @@ go.etcd.io/etcd v0.5.0-alpha.5.0.20200910180754-dd1b699fc489/go.mod h1:yVHk9ub3C go.etcd.io/etcd/api/v3 v3.5.0/go.mod h1:cbVKeC6lCfl7j/8jBhAK6aIYO9XOjdptoxU/nLQcPvs= go.etcd.io/etcd/client/pkg/v3 v3.5.0/go.mod h1:IJHfcCEKxYu1Os13ZdwCwIUTUVGYTSAM3YSwc9/Ac1g= go.etcd.io/etcd/client/v2 v2.305.0/go.mod h1:h9puh54ZTgAKtEbut2oe9P4L/oqKCVB6xsXlzd7alYQ= -go.etcd.io/raft/v3 v3.0.0-20230626154957-a10cd4571633 h1:C6cmDqdkeGrJ6fT7OFCbnPauTEJvFY+HZSWk2N0PUvI= -go.etcd.io/raft/v3 v3.0.0-20230626154957-a10cd4571633/go.mod h1:tP6U+sRzrl75ltgmFcdZg9reZVEyM3vKTxAWmwpHtB8= +go.etcd.io/raft/v3 v3.0.0-20230717153924-72a6e6c9f3ee h1:7KfWqaO+jirXdEFWpwK1fa2NIm1nmBf3fP0AH6NydCk= +go.etcd.io/raft/v3 v3.0.0-20230717153924-72a6e6c9f3ee/go.mod h1:tP6U+sRzrl75ltgmFcdZg9reZVEyM3vKTxAWmwpHtB8= go.mongodb.org/mongo-driver v1.0.3/go.mod h1:u7ryQJ+DOzQmeO7zB6MHyr8jkEQvC8vH7qLUO4lqsUM= go.mongodb.org/mongo-driver v1.1.1/go.mod h1:u7ryQJ+DOzQmeO7zB6MHyr8jkEQvC8vH7qLUO4lqsUM= go.mongodb.org/mongo-driver v1.1.2/go.mod h1:u7ryQJ+DOzQmeO7zB6MHyr8jkEQvC8vH7qLUO4lqsUM= From 4585dde57d68a161b2331ec11abce55d4d2682a8 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Tue, 11 Jul 2023 10:02:14 +0200 Subject: [PATCH 2/2] kvserver: prevent etcd-io/raft from dropping conf changes Relies on https://github.com/etcd-io/raft/pull/81. We don't want it to, since that causes issues due to false positives. We are taking responsibility for carrying out only valid conf changes, as we always have. See also https://github.com/etcd-io/raft/issues/80. Fixes #105797. Epic: CRDB-25287 Release note (bug fix): under rare circumstances, a replication change could get stuck when proposed near lease/leadership changes (and likely under overload), and the replica circuit breakers could trip. This problem has been addressed. Note to editors: this time it's really addressed (fingers crossed); a previous attempt with an identical release note had to be reverted. --- pkg/kv/kvserver/client_raft_test.go | 63 +++++++++++++++++++++++++++++ pkg/kv/kvserver/store.go | 25 ++++++------ 2 files changed, 76 insertions(+), 12 deletions(-) diff --git a/pkg/kv/kvserver/client_raft_test.go b/pkg/kv/kvserver/client_raft_test.go index 00409740a6f1..4dce434da100 100644 --- a/pkg/kv/kvserver/client_raft_test.go +++ b/pkg/kv/kvserver/client_raft_test.go @@ -32,6 +32,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvpb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/storepool" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/isolation" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvflowcontrol/kvflowdispatch" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb" @@ -7030,3 +7031,65 @@ func TestStoreMetricsOnIncomingOutgoingMsg(t *testing.T) { require.Equal(t, expected, actual) }) } + +// TestInvalidConfChangeRejection is a regression test for [1]. It proposes +// an (intentionally) invalid configuration change and makes sure that raft +// does not drop it. +// +// [1]: https://github.com/cockroachdb/cockroach/issues/105797 +func TestInvalidConfChangeRejection(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + // This is a regression test against a stuck command, so set a timeout to get + // a shot at a graceful failure on regression. + ctx, cancel := context.WithTimeout(context.Background(), testutils.DefaultSucceedsSoonDuration) + defer cancel() + + // When our configuration change shows up below raft, we need to apply it as a + // no-op, since the config change is intentionally invalid and assertions + // would fail if we were to try to actually apply it. + injErr := errors.New("injected error") + tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{ + ReplicationMode: base.ReplicationManual, + ServerArgs: base.TestServerArgs{Knobs: base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{ + TestingApplyCalledTwiceFilter: func(args kvserverbase.ApplyFilterArgs) (int, *kvpb.Error) { + if args.Req != nil && args.Req.Txn != nil && args.Req.Txn.Name == "fake" { + return 0, kvpb.NewError(injErr) + } + return 0, nil + }}}}, + }) + defer tc.Stopper().Stop(ctx) + + k := tc.ScratchRange(t) + + repl := tc.GetFirstStoreFromServer(t, 0).LookupReplica(keys.MustAddr(k)) + + // Try to leave a joint config even though we're not in one. This is something + // that will lead raft to propose an empty entry instead of our conf change. + // + // See: https://github.com/cockroachdb/cockroach/issues/105797 + var ba kvpb.BatchRequest + now := tc.Server(0).Clock().Now() + txn := roachpb.MakeTransaction("fake", k, isolation.Serializable, roachpb.NormalUserPriority, now, 500*time.Millisecond.Nanoseconds(), 1) + ba.Txn = &txn + ba.Timestamp = now + ba.Add(&kvpb.EndTxnRequest{ + RequestHeader: kvpb.RequestHeader{ + Key: k, + }, + Commit: true, + InternalCommitTrigger: &roachpb.InternalCommitTrigger{ + ChangeReplicasTrigger: &roachpb.ChangeReplicasTrigger{ + Desc: repl.Desc(), + }, + }, + }) + + _, pErr := repl.Send(ctx, &ba) + // Verify that we see the configuration change below raft, where we rejected it + // (since it would've otherwise blow up the Replica: after all, we intentionally + // proposed an invalid configuration change. + require.True(t, errors.Is(pErr.GoError(), injErr), "%+v", pErr.GoError()) +} diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go index 996def487bca..41aefb1f7a52 100644 --- a/pkg/kv/kvserver/store.go +++ b/pkg/kv/kvserver/store.go @@ -305,18 +305,19 @@ func newRaftConfig( logger raft.Logger, ) *raft.Config { return &raft.Config{ - ID: id, - Applied: uint64(appliedIndex), - AsyncStorageWrites: true, - ElectionTick: storeCfg.RaftElectionTimeoutTicks, - HeartbeatTick: storeCfg.RaftHeartbeatIntervalTicks, - MaxUncommittedEntriesSize: storeCfg.RaftMaxUncommittedEntriesSize, - MaxCommittedSizePerReady: storeCfg.RaftMaxCommittedSizePerReady, - MaxSizePerMsg: storeCfg.RaftMaxSizePerMsg, - MaxInflightMsgs: storeCfg.RaftMaxInflightMsgs, - MaxInflightBytes: storeCfg.RaftMaxInflightBytes, - Storage: strg, - Logger: logger, + ID: id, + Applied: uint64(appliedIndex), + AsyncStorageWrites: true, + ElectionTick: storeCfg.RaftElectionTimeoutTicks, + HeartbeatTick: storeCfg.RaftHeartbeatIntervalTicks, + MaxUncommittedEntriesSize: storeCfg.RaftMaxUncommittedEntriesSize, + MaxCommittedSizePerReady: storeCfg.RaftMaxCommittedSizePerReady, + DisableConfChangeValidation: true, // see https://github.com/cockroachdb/cockroach/issues/105797 + MaxSizePerMsg: storeCfg.RaftMaxSizePerMsg, + MaxInflightMsgs: storeCfg.RaftMaxInflightMsgs, + MaxInflightBytes: storeCfg.RaftMaxInflightBytes, + Storage: strg, + Logger: logger, PreVote: true, CheckQuorum: storeCfg.RaftEnableCheckQuorum,