Skip to content

Commit

Permalink
chore: support debug shell for advanced development
Browse files Browse the repository at this point in the history
Support dropping into a very minimal debug shell.

```bash
sudo -E --preserve-env=HOME _out/talosctl-linux-amd64 cluster create --provisioner=qemu $REGISTRY_MIRROR_FLAGS --controlplanes=1 --workers=0 --with-bootloader=false --with-debug-shell
```

Signed-off-by: Noel Georgi <[email protected]>
  • Loading branch information
frezbo authored and dsseng committed Oct 5, 2024
1 parent fd071b8 commit 70da315
Show file tree
Hide file tree
Showing 8 changed files with 155 additions and 81 deletions.
17 changes: 17 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ ARG PKG_CNI
ARG PKG_FLANNEL_CNI
ARG PKG_TALOSCTL_CNI_BUNDLE_INSTALL

ARG DEBUG_TOOLS_SOURCE

# Resolve package images using ${PKGS} to be used later in COPY --from=.

FROM ${PKG_FHS} AS pkg-fhs
Expand Down Expand Up @@ -140,6 +142,17 @@ FROM ${PKG_KERNEL} AS pkg-kernel
FROM --platform=amd64 ${PKG_KERNEL} AS pkg-kernel-amd64
FROM --platform=arm64 ${PKG_KERNEL} AS pkg-kernel-arm64

FROM --platform=amd64 ${TOOLS} as tools-amd64
FROM --platform=arm64 ${TOOLS} as tools-arm64

# Minimal debug tooling for amd64: bash and the ld-musl loader copied out of the amd64 tools image.
# NOTE(review): the arm64 stage installs bash at /bin/bash while this stage uses /toolchain/bin/bash — confirm which destination is intended.
FROM scratch AS pkg-debug-tools-amd64
COPY --from=tools-amd64 /toolchain/bin/bash /toolchain/bin/bash
# Keep the loader at the same path it has in the tools image (the previous destination doubled the /toolchain prefix).
COPY --from=tools-amd64 /toolchain/lib/ld-musl-x86_64.so.1 /toolchain/lib/ld-musl-x86_64.so.1

# Minimal debug tooling for arm64: bash and the ld-musl loader copied out of the arm64 tools image.
# NOTE(review): bash lands at /bin/bash here but at /toolchain/bin/bash in the amd64 stage — confirm which destination is intended.
FROM scratch as pkg-debug-tools-arm64
COPY --from=tools-arm64 /toolchain/bin/bash /bin/bash
COPY --from=tools-arm64 /toolchain/lib/ld-musl-aarch64.so.1 /toolchain/lib/ld-musl-aarch64.so.1

# Strip CNI package.

FROM scratch AS pkg-cni-stripped-amd64
Expand Down Expand Up @@ -658,6 +671,10 @@ COPY --link --from=pkg-kmod-amd64 /usr/lib/libkmod.* /rootfs/lib/
COPY --link --from=pkg-kmod-amd64 /usr/bin/kmod /rootfs/sbin/modprobe
COPY --link --from=modules-amd64 /lib/modules /rootfs/lib/modules
COPY --link --from=machined-build-amd64 /machined /rootfs/sbin/init

# this is a no-op as it copies from a scratch image when WITH_DEBUG_SHELL is not set
COPY --link --from=pkg-debug-tools-amd64 * /rootfs/

RUN <<END
# the orderly_poweroff call by the kernel will call '/sbin/poweroff'
ln /rootfs/sbin/init /rootfs/sbin/poweroff
Expand Down
9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ CI_RELEASE_TAG := $(shell git log --oneline --format=%B -n 1 HEAD^2 -- 2>/dev/nu
ARTIFACTS := _out
TOOLS ?= ghcr.io/siderolabs/tools:v1.9.0-alpha.0-3-g1151610

DEBUG_TOOLS_SOURCE := scratch

PKGS_PREFIX ?= ghcr.io/siderolabs
PKGS ?= v1.9.0-alpha.0-18-gba0341e
EXTRAS ?= v1.9.0-alpha.0
Expand Down Expand Up @@ -147,6 +149,12 @@ GO_LDFLAGS += -s -w
endif

GO_BUILDFLAGS_TALOSCTL := $(GO_BUILDFLAGS) -tags "$(GO_BUILDTAGS_TALOSCTL)"

ifneq (, $(filter $(WITH_DEBUG_SHELL), t true TRUE y yes 1))
# bash-minimal is a Dockerfile target that copies over the bash from siderolabs tools
DEBUG_TOOLS_SOURCE := bash-minimal
endif

GO_BUILDFLAGS += -tags "$(GO_BUILDTAGS)"

, := ,
Expand All @@ -160,6 +168,7 @@ COMMON_ARGS += --progress=$(PROGRESS)
COMMON_ARGS += --platform=$(PLATFORM)
COMMON_ARGS += --push=$(PUSH)
COMMON_ARGS += --build-arg=TOOLS=$(TOOLS)
COMMON_ARGS += --build-arg=DEBUG_TOOLS_SOURCE=$(DEBUG_TOOLS_SOURCE)
COMMON_ARGS += --build-arg=PKGS=$(PKGS)
COMMON_ARGS += --build-arg=EXTRAS=$(EXTRAS)
COMMON_ARGS += --build-arg=GOFUMPT_VERSION=$(GOFUMPT_VERSION)
Expand Down
168 changes: 87 additions & 81 deletions cmd/talosctl/cmd/mgmt/cluster/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ const (
controlPlanePortFlag = "control-plane-port"
firewallFlag = "with-firewall"
tpm2EnabledFlag = "with-tpm2"
withDebugShellFlag = "with-debug-shell"

// The following flags are the gen options - the options that are only used in machine configuration (i.e., not during the qemu/docker provisioning).
// They are not applicable when no machine configuration is generated, hence mutually exclusive with the --input-dir flag.
Expand All @@ -107,87 +108,89 @@ const (
)

var (
talosconfig string
nodeImage string
nodeInstallImage string
registryMirrors []string
registryInsecure []string
kubernetesVersion string
nodeVmlinuzPath string
nodeInitramfsPath string
nodeISOPath string
nodeDiskImagePath string
nodeIPXEBootScript string
applyConfigEnabled bool
bootloaderEnabled bool
uefiEnabled bool
tpm2Enabled bool
extraUEFISearchPaths []string
configDebug bool
networkCIDR string
networkNoMasqueradeCIDRs []string
networkMTU int
networkIPv4 bool
networkIPv6 bool
wireguardCIDR string
nameservers []string
dnsDomain string
workers int
controlplanes int
controlPlaneCpus string
workersCpus string
controlPlaneMemory int
workersMemory int
clusterDiskSize int
clusterDiskPreallocate bool
clusterDisks []string
extraDisks int
extraDiskSize int
extraDisksDrivers []string
targetArch string
clusterWait bool
clusterWaitTimeout time.Duration
forceInitNodeAsEndpoint bool
forceEndpoint string
inputDir string
cniBinPath []string
cniConfDir string
cniCacheDir string
cniBundleURL string
ports string
dockerHostIP string
withInitNode bool
customCNIUrl string
crashdumpOnFailure bool
skipKubeconfig bool
skipInjectingConfig bool
talosVersion string
encryptStatePartition bool
encryptEphemeralPartition bool
useVIP bool
enableKubeSpan bool
enableClusterDiscovery bool
configPatch []string
configPatchControlPlane []string
configPatchWorker []string
badRTC bool
extraBootKernelArgs string
dockerDisableIPv6 bool
controlPlanePort int
kubePrismPort int
dhcpSkipHostname bool
skipK8sNodeReadinessCheck bool
networkChaos bool
jitter time.Duration
latency time.Duration
packetLoss float64
packetReorder float64
packetCorrupt float64
bandwidth int
diskEncryptionKeyTypes []string
withFirewall string
withUUIDHostnames bool
withSiderolinkAgent agentFlag
talosconfig string
nodeImage string
nodeInstallImage string
registryMirrors []string
registryInsecure []string
kubernetesVersion string
nodeVmlinuzPath string
nodeInitramfsPath string
nodeISOPath string
nodeDiskImagePath string
nodeIPXEBootScript string
applyConfigEnabled bool
bootloaderEnabled bool
uefiEnabled bool
tpm2Enabled bool
extraUEFISearchPaths []string
configDebug bool
networkCIDR string
networkNoMasqueradeCIDRs []string
networkMTU int
networkIPv4 bool
networkIPv6 bool
wireguardCIDR string
nameservers []string
dnsDomain string
workers int
controlplanes int
controlPlaneCpus string
workersCpus string
controlPlaneMemory int
workersMemory int
clusterDiskSize int
clusterDiskPreallocate bool
clusterDisks []string
extraDisks int
extraDiskSize int
extraDisksDrivers []string
targetArch string
clusterWait bool
clusterWaitTimeout time.Duration
forceInitNodeAsEndpoint bool
forceEndpoint string
inputDir string
cniBinPath []string
cniConfDir string
cniCacheDir string
cniBundleURL string
ports string
dockerHostIP string
withInitNode bool
customCNIUrl string
crashdumpOnFailure bool
skipKubeconfig bool
skipInjectingConfig bool
talosVersion string
encryptStatePartition bool
encryptEphemeralPartition bool
useVIP bool
enableKubeSpan bool
enableClusterDiscovery bool
configPatch []string
configPatchControlPlane []string
configPatchWorker []string
badRTC bool
extraBootKernelArgs string
dockerDisableIPv6 bool
controlPlanePort int
kubePrismPort int
dhcpSkipHostname bool
skipK8sNodeReadinessCheck bool
networkChaos bool
jitter time.Duration
latency time.Duration
packetLoss float64
packetReorder float64
packetCorrupt float64
bandwidth int
diskEncryptionKeyTypes []string
withFirewall string
withUUIDHostnames bool
withSiderolinkAgent agentFlag
debugShellEnabled bool
skipBootPhaseFinishedCheck bool
)

// createCmd represents the cluster up command.
Expand Down Expand Up @@ -468,6 +471,7 @@ func create(ctx context.Context) error {
provision.WithBootlader(bootloaderEnabled),
provision.WithUEFI(uefiEnabled),
provision.WithTPM2(tpm2Enabled),
provision.WithDebugShell(debugShellEnabled),
provision.WithExtraUEFISearchPaths(extraUEFISearchPaths),
provision.WithTargetArch(targetArch),
provision.WithSiderolinkAgent(withSiderolinkAgent.IsEnabled()),
Expand Down Expand Up @@ -1172,6 +1176,8 @@ func init() {
createCmd.Flags().BoolVar(&bootloaderEnabled, bootloaderEnabledFlag, true, "enable bootloader to load kernel and initramfs from disk image after install")
createCmd.Flags().BoolVar(&uefiEnabled, "with-uefi", true, "enable UEFI on x86_64 architecture")
createCmd.Flags().BoolVar(&tpm2Enabled, tpm2EnabledFlag, false, "enable TPM2 emulation support using swtpm")
createCmd.Flags().BoolVar(&debugShellEnabled, withDebugShellFlag, false, "drop talos into a maintenance shell on boot, this is for advanced debugging for developers only")
createCmd.Flags().MarkHidden("with-debug-shell") //nolint:errcheck
createCmd.Flags().StringSliceVar(&extraUEFISearchPaths, "extra-uefi-search-paths", []string{}, "additional search paths for UEFI firmware (only applies when UEFI is enabled)")
createCmd.Flags().StringSliceVar(&registryMirrors, registryMirrorFlag, []string{}, "list of registry mirrors to use in format: <registry host>=<mirror URL>")
createCmd.Flags().StringSliceVar(&registryInsecure, registryInsecureFlag, []string{}, "list of registry hostnames to skip TLS verification for")
Expand Down
8 changes: 8 additions & 0 deletions internal/pkg/mount/switchroot/switchroot.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,14 @@ func Switch(prefix string, mountpoints *mount.Points) (err error) {
log.Printf("race detection enabled with halt_on_error=1")
}

if val := procfs.ProcCmdline().Get("talos.debugshell"); val != nil {
if err = unix.Exec("/bin/bash", []string{"/bin/bash"}, envv); err != nil {
return fmt.Errorf("error executing /bin/bash: %w", err)
}

return nil
}

if err = unix.Exec("/sbin/init", []string{"/sbin/init"}, envv); err != nil {
return fmt.Errorf("error executing /sbin/init: %w", err)
}
Expand Down
11 changes: 11 additions & 0 deletions pkg/provision/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,15 @@ func WithTPM2(enabled bool) Option {
}
}

// WithDebugShell returns an Option that stores the requested debug shell
// setting in Options.WithDebugShell.
func WithDebugShell(enabled bool) Option {
	apply := func(opts *Options) error {
		opts.WithDebugShell = enabled

		return nil
	}

	return apply
}

// WithExtraUEFISearchPaths configures additional search paths to look for UEFI firmware.
func WithExtraUEFISearchPaths(extraUEFISearchPaths []string) Option {
return func(o *Options) error {
Expand Down Expand Up @@ -157,6 +166,8 @@ type Options struct {
UEFIEnabled bool
// Enable TPM2 emulation using swtpm.
TPM2Enabled bool
// Enable the debug shell (the qemu provisioner boots the node with the talos.debugshell kernel argument).
WithDebugShell bool
// Configure additional search paths to look for UEFI firmware.
ExtraUEFISearchPaths []string

Expand Down
9 changes: 9 additions & 0 deletions pkg/provision/providers/qemu/launch.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ type LaunchConfig struct {
NodeUUID uuid.UUID
BadRTC bool
ArchitectureData Arch
WithDebugShell bool

// Talos config
Config string
Expand Down Expand Up @@ -320,6 +321,14 @@ func launchVM(config *LaunchConfig) error {
"pause",
}

if config.WithDebugShell {
args = append(
args,
"-serial",
fmt.Sprintf("unix:%s/%s.serial,server,nowait", config.StatePath, config.Hostname),
)
}

var (
scsiAttached, ahciAttached, nvmeAttached bool
ahciBus int
Expand Down
5 changes: 5 additions & 0 deletions pkg/provision/providers/qemu/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ func (p *provisioner) createNode(state *vm.State, clusterReq provision.ClusterRe
}
}

if opts.WithDebugShell {
cmdline.Append("talos.debugshell", "")
}

var nodeConfig string

if !nodeReq.SkipInjectingConfig {
Expand Down Expand Up @@ -157,6 +161,7 @@ func (p *provisioner) createNode(state *vm.State, clusterReq provision.ClusterRe
TFTPServer: nodeReq.TFTPServer,
IPXEBootFileName: nodeReq.IPXEBootFilename,
APIPort: apiPort,
WithDebugShell: opts.WithDebugShell,
}

if clusterReq.IPXEBootScript != "" {
Expand Down
9 changes: 9 additions & 0 deletions pkg/provision/providers/qemu/preflight.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ func (p *provisioner) preflightChecks(ctx context.Context, request provision.Clu
checkContext.qemuExecutable,
checkContext.checkFlashImages,
checkContext.swtpmExecutable,
checkContext.numberOfNodesWhenDebugShellEnabled,
checkContext.cniDirectories,
checkContext.cniBundle,
checkContext.checkIptables,
Expand Down Expand Up @@ -113,6 +114,14 @@ func (check *preflightCheckContext) swtpmExecutable(ctx context.Context) error {
return nil
}

// numberOfNodesWhenDebugShellEnabled rejects cluster requests that combine the
// debug shell option with more than one node (control plane plus workers).
// It returns nil when the debug shell is disabled or at most one node is requested.
func (check *preflightCheckContext) numberOfNodesWhenDebugShellEnabled(ctx context.Context) error {
	// Without the debug shell there is nothing to validate.
	if !check.options.WithDebugShell {
		return nil
	}

	totalNodes := len(check.request.Nodes.ControlPlaneNodes()) + len(check.request.Nodes.WorkerNodes())
	if totalNodes <= 1 {
		return nil
	}

	return fmt.Errorf("error: --with-debug-shell is not supported with more than one node")
}

func (check *preflightCheckContext) cniDirectories(ctx context.Context) error {
cniDirs := append([]string{}, check.request.Network.CNI.BinPath...)
cniDirs = append(cniDirs, check.request.Network.CNI.CacheDir, check.request.Network.CNI.ConfDir)
Expand Down

0 comments on commit 70da315

Please sign in to comment.