From 107d60fdd220d9578d01f179beb3c226e3803f52 Mon Sep 17 00:00:00 2001
From: Allen Ray
Date: Wed, 7 Jun 2023 13:03:38 -0400
Subject: [PATCH] Adding optional revision bump and mark compacted to snapshot restore

Signed-off-by: Allen Ray
---
 etcdctl/ctlv3/command/snapshot_command.go |   6 +-
 etcdutl/etcdutl/snapshot_command.go       |  15 ++-
 etcdutl/snapshot/util.go                  |  22 ++++
 etcdutl/snapshot/v3_snapshot.go           |  83 +++++++++++++
 server/mvcc/util.go                       |   6 +
 tests/e2e/ctl_v3_snapshot_test.go         | 141 ++++++++++++++++++++++
 tests/e2e/utils.go                        |  24 ++++
 7 files changed, 295 insertions(+), 2 deletions(-)

diff --git a/etcdctl/ctlv3/command/snapshot_command.go b/etcdctl/ctlv3/command/snapshot_command.go
index a11340f8e66a..791633dc220c 100644
--- a/etcdctl/ctlv3/command/snapshot_command.go
+++ b/etcdctl/ctlv3/command/snapshot_command.go
@@ -40,6 +40,8 @@ var (
 	restorePeerURLs     string
 	restoreName         string
 	skipHashCheck       bool
+	markCompacted       bool
+	revisionBump        uint64
 )
 
 // NewSnapshotCommand returns the cobra command for "snapshot".
@@ -89,6 +91,8 @@ func NewSnapshotRestoreCommand() *cobra.Command {
 	cmd.Flags().StringVar(&restorePeerURLs, "initial-advertise-peer-urls", defaultInitialAdvertisePeerURLs, "List of this member's peer URLs to advertise to the rest of the cluster")
 	cmd.Flags().StringVar(&restoreName, "name", defaultName, "Human-readable name for this member")
 	cmd.Flags().BoolVar(&skipHashCheck, "skip-hash-check", false, "Ignore snapshot integrity hash value (required if copied from data directory)")
+	cmd.Flags().Uint64Var(&revisionBump, "bump-revision", 0, "How much to increase the latest revision after restore (required if --mark-compacted)")
+	cmd.Flags().BoolVar(&markCompacted, "mark-compacted", false, "Mark the latest revision after restore as the point of scheduled compaction (required if --bump-revision > 0)")
 
 	return cmd
 }
@@ -127,7 +131,7 @@ func snapshotStatusCommandFunc(cmd *cobra.Command, args []string) {
 func snapshotRestoreCommandFunc(cmd *cobra.Command, args []string) {
 	fmt.Fprintf(os.Stderr, "Deprecated: Use `etcdutl snapshot restore` instead.\n\n")
 	etcdutl.SnapshotRestoreCommandFunc(restoreCluster, restoreClusterToken, restoreDataDir, restoreWalDir,
-		restorePeerURLs, restoreName, skipHashCheck, args)
+		restorePeerURLs, restoreName, skipHashCheck, revisionBump, markCompacted, args)
 }
 
 func initialClusterFromName(name string) string {
diff --git a/etcdutl/etcdutl/snapshot_command.go b/etcdutl/etcdutl/snapshot_command.go
index 94ab2a5ac927..f7cf09c6a561 100644
--- a/etcdutl/etcdutl/snapshot_command.go
+++ b/etcdutl/etcdutl/snapshot_command.go
@@ -38,6 +38,8 @@ var (
 	restorePeerURLs     string
 	restoreName         string
 	skipHashCheck       bool
+	markCompacted       bool
+	revisionBump        uint64
 )
 
 // NewSnapshotCommand returns the cobra command for "snapshot".
@@ -91,6 +93,8 @@ func NewSnapshotRestoreCommand() *cobra.Command {
 	cmd.Flags().StringVar(&restorePeerURLs, "initial-advertise-peer-urls", defaultInitialAdvertisePeerURLs, "List of this member's peer URLs to advertise to the rest of the cluster")
 	cmd.Flags().StringVar(&restoreName, "name", defaultName, "Human-readable name for this member")
 	cmd.Flags().BoolVar(&skipHashCheck, "skip-hash-check", false, "Ignore snapshot integrity hash value (required if copied from data directory)")
+	cmd.Flags().Uint64Var(&revisionBump, "bump-revision", 0, "How much to increase the latest revision after restore (required if --mark-compacted)")
+	cmd.Flags().BoolVar(&markCompacted, "mark-compacted", false, "Mark the latest revision after restore as the point of scheduled compaction (required if --bump-revision > 0)")
 
 	cmd.MarkFlagRequired("data-dir")
 
@@ -115,7 +119,7 @@ func SnapshotStatusCommandFunc(cmd *cobra.Command, args []string) {
 
 func snapshotRestoreCommandFunc(_ *cobra.Command, args []string) {
 	SnapshotRestoreCommandFunc(restoreCluster, restoreClusterToken, restoreDataDir, restoreWalDir,
-		restorePeerURLs, restoreName, skipHashCheck, args)
+		restorePeerURLs, restoreName, skipHashCheck, revisionBump, markCompacted, args)
 }
 
 func SnapshotRestoreCommandFunc(restoreCluster string,
@@ -125,12 +129,19 @@ func SnapshotRestoreCommandFunc(restoreCluster string,
 	restorePeerURLs string,
 	restoreName string,
 	skipHashCheck bool,
+	revisionBump uint64,
+	markCompacted bool,
 	args []string) {
 	if len(args) != 1 {
 		err := fmt.Errorf("snapshot restore requires exactly one argument")
 		cobrautl.ExitWithError(cobrautl.ExitBadArgs, err)
 	}
 
+	if (revisionBump == 0 && markCompacted) || (revisionBump > 0 && !markCompacted) {
+		err := fmt.Errorf("--bump-revision and --mark-compacted must be used together")
+		cobrautl.ExitWithError(cobrautl.ExitBadArgs, err)
+	}
+
 	dataDir := restoreDataDir
 	if dataDir == "" {
 		dataDir = restoreName + ".etcd"
@@ -153,6 +164,8 @@ func SnapshotRestoreCommandFunc(restoreCluster string,
 		InitialCluster:      restoreCluster,
 		InitialClusterToken: restoreClusterToken,
 		SkipHashCheck:       skipHashCheck,
+		RevisionBump:        revisionBump,
+		MarkCompacted:       markCompacted,
 	}); err != nil {
 		cobrautl.ExitWithError(cobrautl.ExitError, err)
 	}
diff --git a/etcdutl/snapshot/util.go b/etcdutl/snapshot/util.go
index 2c1fae21fa15..a4f3569c6884 100644
--- a/etcdutl/snapshot/util.go
+++ b/etcdutl/snapshot/util.go
@@ -23,9 +23,31 @@ type revision struct {
 	sub  int64
 }
 
+// GreaterThan should be synced with function in server
+// https://github.com/etcd-io/etcd/blob/main/server/storage/mvcc/revision.go
+func (a revision) GreaterThan(b revision) bool {
+	if a.main > b.main {
+		return true
+	}
+	if a.main < b.main {
+		return false
+	}
+	return a.sub > b.sub
+}
+
+// bytesToRev should be synced with function in server
+// https://github.com/etcd-io/etcd/blob/main/server/storage/mvcc/revision.go
 func bytesToRev(bytes []byte) revision {
 	return revision{
 		main: int64(binary.BigEndian.Uint64(bytes[0:8])),
 		sub:  int64(binary.BigEndian.Uint64(bytes[9:])),
 	}
 }
+
+// revToBytes should be synced with function in server
+// https://github.com/etcd-io/etcd/blob/main/server/storage/mvcc/revision.go
+func revToBytes(bytes []byte, rev revision) {
+	binary.BigEndian.PutUint64(bytes[0:8], uint64(rev.main))
+	bytes[8] = '_'
+	binary.BigEndian.PutUint64(bytes[9:], uint64(rev.sub))
+}
diff --git a/etcdutl/snapshot/v3_snapshot.go b/etcdutl/snapshot/v3_snapshot.go
index 7961977f18a7..94cec83ab4b1 100644
--- a/etcdutl/snapshot/v3_snapshot.go
+++ b/etcdutl/snapshot/v3_snapshot.go
@@ -40,7 +40,9 @@ import (
 	"go.etcd.io/etcd/server/v3/etcdserver/api/snap"
 	"go.etcd.io/etcd/server/v3/etcdserver/api/v2store"
 	"go.etcd.io/etcd/server/v3/etcdserver/cindex"
+	"go.etcd.io/etcd/server/v3/mvcc"
 	"go.etcd.io/etcd/server/v3/mvcc/backend"
+	"go.etcd.io/etcd/server/v3/mvcc/buckets"
 	"go.etcd.io/etcd/server/v3/verify"
 	"go.etcd.io/etcd/server/v3/wal"
 	"go.etcd.io/etcd/server/v3/wal/walpb"
@@ -194,6 +196,16 @@ type RestoreConfig struct {
 	// SkipHashCheck is "true" to ignore snapshot integrity hash value
 	// (required if copied from data directory).
 	SkipHashCheck bool
+
+	// RevisionBump is the amount to increase the latest revision after restore,
+	// to allow administrators to trick clients into thinking that revision never decreased.
+	// If 0, revision bumping is skipped.
+	// (required if MarkCompacted == true)
+	RevisionBump uint64
+
+	// MarkCompacted is "true" to mark the latest revision as compacted.
+	// (required if RevisionBump > 0)
+	MarkCompacted bool
 }
 
 // Restore restores a new etcd data directory from given snapshot file.
@@ -257,6 +269,13 @@ func (s *v3Manager) Restore(cfg RestoreConfig) error {
 	if err = s.saveDB(); err != nil {
 		return err
 	}
+
+	if cfg.MarkCompacted && cfg.RevisionBump > 0 {
+		if err = s.modifyLatestRevision(cfg.RevisionBump); err != nil {
+			return err
+		}
+	}
+
 	hardstate, err := s.saveWALAndSnap()
 	if err != nil {
 		return err
@@ -303,6 +322,70 @@ func (s *v3Manager) saveDB() error {
 	return nil
 }
 
+// modifyLatestRevision can increase the latest revision by the given amount and sets the scheduled compaction
+// to that revision so that the server will consider this revision compacted.
+func (s *v3Manager) modifyLatestRevision(bumpAmount uint64) error {
+	be := backend.NewDefaultBackend(s.outDbPath())
+	defer func() {
+		be.ForceCommit()
+		be.Close()
+	}()
+
+	tx := be.BatchTx()
+	tx.LockOutsideApply()
+	defer tx.Unlock()
+
+	latest, err := s.unsafeGetLatestRevision(tx)
+	if err != nil {
+		return err
+	}
+
+	latest = s.unsafeBumpRevision(tx, latest, int64(bumpAmount))
+	s.unsafeMarkRevisionCompacted(tx, latest)
+
+	return nil
+}
+
+func (s *v3Manager) unsafeBumpRevision(tx backend.BatchTx, latest revision, amount int64) revision {
+	s.lg.Info(
+		"bumping latest revision",
+		zap.Int64("latest-revision", latest.main),
+		zap.Int64("bump-amount", amount),
+		zap.Int64("new-latest-revision", latest.main+amount),
+	)
+
+	latest.main += amount
+	latest.sub = 0
+	k := make([]byte, 17)
+	revToBytes(k, latest)
+	tx.UnsafePut(buckets.Key, k, []byte{})
+
+	return latest
+}
+
+func (s *v3Manager) unsafeMarkRevisionCompacted(tx backend.BatchTx, latest revision) {
+	s.lg.Info(
+		"marking revision compacted",
+		zap.Int64("revision", latest.main),
+	)
+
+	mvcc.UnsafeSetScheduledCompact(tx, latest.main)
+}
+
+func (s *v3Manager) unsafeGetLatestRevision(tx backend.BatchTx) (revision, error) {
+	var latest revision
+	err := tx.UnsafeForEach(buckets.Key, func(k, _ []byte) (err error) {
+		rev := bytesToRev(k)
+
+		if rev.GreaterThan(latest) {
+			latest = rev
+		}
+
+		return nil
+	})
+	return latest, err
+}
+
 func (s *v3Manager) copyAndVerifyDB() error {
 	srcf, ferr := os.Open(s.srcDbPath)
 	if ferr != nil {
diff --git a/server/mvcc/util.go b/server/mvcc/util.go
index c4c0ff2f013e..db79bffa653a 100644
--- a/server/mvcc/util.go
+++ b/server/mvcc/util.go
@@ -35,3 +35,9 @@ func WriteKV(be backend.Backend, kv mvccpb.KeyValue) {
 	be.BatchTx().UnsafePut(buckets.Key, ibytes, d)
 	be.BatchTx().Unlock()
 }
+
+func UnsafeSetScheduledCompact(tx backend.BatchTx, value int64) {
+	rbytes := newRevBytes()
+	revToBytes(revision{main: value}, rbytes)
+	tx.UnsafePut(buckets.Meta, scheduledCompactKeyName, rbytes)
+}
diff --git a/tests/e2e/ctl_v3_snapshot_test.go b/tests/e2e/ctl_v3_snapshot_test.go
index e0fe86fe964b..f682f3cac89d 100644
--- a/tests/e2e/ctl_v3_snapshot_test.go
+++ b/tests/e2e/ctl_v3_snapshot_test.go
@@ -15,6 +15,7 @@
 package e2e
 
 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -24,6 +25,9 @@ import (
 	"testing"
 	"time"
 
+	"github.com/stretchr/testify/require"
+	v3rpc "go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
+	clientv3 "go.etcd.io/etcd/client/v3"
 	"go.etcd.io/etcd/etcdutl/v3/snapshot"
 	"go.etcd.io/etcd/pkg/v3/expect"
 )
@@ -290,3 +294,140 @@ func testIssue6361(t *testing.T, etcdutl bool) {
 	}
 	t.Log("Test logic done")
 }
+
+func TestRestoreCompactionRevBump(t *testing.T) {
+	BeforeTest(t)
+
+	epc, err := newEtcdProcessCluster(t, &etcdProcessClusterConfig{
+		clusterSize:  1,
+		initialToken: "new",
+		keepDataDir:  true,
+	})
+	if err != nil {
+		t.Fatalf("could not start etcd process cluster (%v)", err)
+	}
+	defer func() {
+		if errC := epc.Close(); errC != nil {
+			t.Fatalf("error closing etcd processes (%v)", errC)
+		}
+	}()
+
+	dialTimeout := 10 * time.Second
+	prefixArgs := []string{ctlBinPath, "--endpoints", strings.Join(epc.EndpointsV3(), ","), "--dial-timeout", dialTimeout.String()}
+
+	ctl := newClient(t, epc.EndpointsV3(), epc.cfg.clientTLS, epc.cfg.isClientAutoTLS)
+	watchCh := ctl.Watch(context.Background(), "foo", clientv3.WithPrefix())
+	// flake-fix: the watch can sometimes miss the first put below causing test failure
+	time.Sleep(100 * time.Millisecond)
+
+	kvs := []kv{{"foo1", "val1"}, {"foo2", "val2"}, {"foo3", "val3"}}
+	for i := range kvs {
+		if err = spawnWithExpect(append(prefixArgs, "put", kvs[i].key, kvs[i].val), "OK"); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	watchTimeout := 1 * time.Second
+	watchRes, err := keyValuesFromWatchChan(watchCh, len(kvs), watchTimeout)
+	require.NoErrorf(t, err, "failed to get key-values from watch channel %s", err)
+	require.Equal(t, kvs, watchRes)
+
+	// ensure we get the right revision back for each of the keys
+	currentRev := 4
+	baseRev := 2
+	hasKVs(t, ctl, kvs, currentRev, baseRev)
+
+	fpath := filepath.Join(t.TempDir(), "test.snapshot")
+
+	t.Log("etcdctl saving snapshot...")
+	require.NoError(t, spawnWithExpect(append(prefixArgs, "snapshot", "save", fpath), fmt.Sprintf("Snapshot saved at %s", fpath)))
+
+	// add some more kvs that are not in the snapshot that will be lost after restore
+	unsnappedKVs := []kv{{"unsnapped1", "one"}, {"unsnapped2", "two"}, {"unsnapped3", "three"}}
+	for i := range unsnappedKVs {
+		if err = spawnWithExpect(append(prefixArgs, "put", unsnappedKVs[i].key, unsnappedKVs[i].val), "OK"); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	t.Log("Stopping the original server...")
+	require.NoError(t, epc.Stop())
+
+	newDataDir := filepath.Join(t.TempDir(), "test.data")
+	t.Log("etcdctl restoring the snapshot...")
+	bumpAmount := 10000
+	err = spawnWithExpect([]string{
+		utlBinPath,
+		"snapshot",
+		"restore", fpath,
+		"--name", epc.procs[0].Config().name,
+		"--initial-cluster", epc.procs[0].Config().initialCluster,
+		"--initial-cluster-token", epc.procs[0].Config().initialToken,
+		"--initial-advertise-peer-urls", epc.procs[0].Config().purl.String(),
+		"--bump-revision", fmt.Sprintf("%d", bumpAmount),
+		"--mark-compacted",
+		"--data-dir", newDataDir,
+	}, "added member")
+	require.NoError(t, err)
+
+	t.Log("(Re)starting the etcd member using the restored snapshot...")
+	epc.procs[0].Config().dataDirPath = newDataDir
+	for i := range epc.procs[0].Config().args {
+		if epc.procs[0].Config().args[i] == "--data-dir" {
+			epc.procs[0].Config().args[i+1] = newDataDir
+		}
+	}
+
+	require.NoError(t, epc.Restart())
+
+	t.Log("Ensuring the restored member has the correct data...")
+	hasKVs(t, ctl, kvs, currentRev, baseRev)
+
+	for i := range unsnappedKVs {
+		v, err := ctl.Get(context.Background(), unsnappedKVs[i].key)
+		require.NoError(t, err)
+		require.Equal(t, int64(0), v.Count)
+	}
+
+	cancelResult, ok := <-watchCh
+	require.True(t, ok, "watchChannel should be open")
+	require.Equal(t, v3rpc.ErrCompacted, cancelResult.Err())
+	require.Truef(t, cancelResult.Canceled, "expected ongoing watch to be cancelled after restoring with --mark-compacted")
+	require.Equal(t, int64(bumpAmount+currentRev), cancelResult.CompactRevision)
+	_, ok = <-watchCh
+	require.False(t, ok, "watchChannel should be closed after restoring with --mark-compacted")
+
+	// clients might restart the watch at the old base revision, that should not yield any new data
+	// everything up until bumpAmount+currentRev should return "already compacted"
+	for i := bumpAmount - 2; i < bumpAmount+currentRev; i++ {
+		watchCh = ctl.Watch(context.Background(), "foo", clientv3.WithPrefix(), clientv3.WithRev(int64(i)))
+		cancelResult := <-watchCh
+		require.Equal(t, v3rpc.ErrCompacted, cancelResult.Err())
+		require.Truef(t, cancelResult.Canceled, "expected ongoing watch to be cancelled after restoring with --mark-compacted")
+		require.Equal(t, int64(bumpAmount+currentRev), cancelResult.CompactRevision)
+	}
+
+	// a watch after that revision should yield successful results when a new put arrives
+	ctx, cancel := context.WithTimeout(context.Background(), watchTimeout*5)
+	defer cancel()
+	watchCh = ctl.Watch(ctx, "foo", clientv3.WithPrefix(), clientv3.WithRev(int64(bumpAmount+currentRev+1)))
+	if err = spawnWithExpect(append(prefixArgs, "put", "foo4", "val4"), "OK"); err != nil {
+		t.Fatal(err)
+	}
+	watchRes, err = keyValuesFromWatchChan(watchCh, 1, watchTimeout)
+	require.NoErrorf(t, err, "failed to get key-values from watch channel %s", err)
+	require.Equal(t, []kv{{"foo4", "val4"}}, watchRes)
+
+}
+
+func hasKVs(t *testing.T, ctl *clientv3.Client, kvs []kv, currentRev int, baseRev int) {
+	for i := range kvs {
+		v, err := ctl.Get(context.Background(), kvs[i].key)
+		require.NoError(t, err)
+		require.Equal(t, int64(1), v.Count)
+		require.Equal(t, kvs[i].val, string(v.Kvs[0].Value))
+		require.Equal(t, int64(baseRev+i), v.Kvs[0].CreateRevision)
+		require.Equal(t, int64(baseRev+i), v.Kvs[0].ModRevision)
+		require.Equal(t, int64(1), v.Kvs[0].Version)
+	}
+}
diff --git a/tests/e2e/utils.go b/tests/e2e/utils.go
index d05b3ad46419..fc504c72fdfe 100644
--- a/tests/e2e/utils.go
+++ b/tests/e2e/utils.go
@@ -16,6 +16,7 @@ package e2e
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"testing"
 	"time"
@@ -121,3 +122,26 @@
 	}
 	return g.Wait()
 }
+
+func keyValuesFromWatchResponse(resp clientv3.WatchResponse) (kvs []kv) {
+	for _, event := range resp.Events {
+		kvs = append(kvs, kv{string(event.Kv.Key), string(event.Kv.Value)})
+	}
+	return kvs
+}
+
+func keyValuesFromWatchChan(wch clientv3.WatchChan, wantedLen int, timeout time.Duration) (kvs []kv, err error) {
+	for {
+		select {
+		case watchResp, ok := <-wch:
+			if ok {
+				kvs = append(kvs, keyValuesFromWatchResponse(watchResp)...)
+				if len(kvs) == wantedLen {
+					return kvs, nil
+				}
+			}
+		case <-time.After(timeout):
+			return nil, errors.New("closed watcher channel should not block")
+		}
+	}
+}
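
Usage sketch (illustrative, not part of the diff above): with the two new flags applied together, a restore that hides the revision reset from clients might look like the invocation below. The flag names come from this patch; the snapshot file, member name, URLs, cluster token, data directory, and bump amount are placeholder values only. The bump amount should be chosen larger than the number of revisions the old cluster could plausibly have produced after the snapshot was taken, so that clients and watchers resuming at a pre-restore revision receive ErrCompacted rather than stale or diverging data.

    etcdutl snapshot restore backup.db \
      --name m1 \
      --initial-cluster m1=http://127.0.0.1:2380 \
      --initial-cluster-token etcd-cluster-1 \
      --initial-advertise-peer-urls http://127.0.0.1:2380 \
      --data-dir restored.etcd \
      --bump-revision 1000000000 \
      --mark-compacted

Passing only one of --bump-revision and --mark-compacted is rejected by the validation added in SnapshotRestoreCommandFunc, since bumping without marking the new revision as compacted (or vice versa) would leave watchers able to resume silently across the restore.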