diff --git a/Dockerfile b/Dockerfile index d3ce9f8d143..72b4d5be0f7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -42,6 +42,8 @@ ARG PKG_CNI ARG PKG_FLANNEL_CNI ARG PKG_TALOSCTL_CNI_BUNDLE_INSTALL +ARG DEBUG_TOOLS_SOURCE + # Resolve package images using ${PKGS} to be used later in COPY --from=. FROM ${PKG_FHS} AS pkg-fhs @@ -140,6 +142,17 @@ FROM ${PKG_KERNEL} AS pkg-kernel FROM --platform=amd64 ${PKG_KERNEL} AS pkg-kernel-amd64 FROM --platform=arm64 ${PKG_KERNEL} AS pkg-kernel-arm64 +FROM --platform=amd64 ${TOOLS} AS tools-amd64 +FROM --platform=arm64 ${TOOLS} AS tools-arm64 + +FROM scratch AS pkg-debug-tools-amd64 +COPY --from=tools-amd64 /toolchain/bin/bash /bin/bash +COPY --from=tools-amd64 /toolchain/lib/ld-musl-x86_64.so.1 /toolchain/lib/ld-musl-x86_64.so.1 + +FROM scratch AS pkg-debug-tools-arm64 +COPY --from=tools-arm64 /toolchain/bin/bash /bin/bash +COPY --from=tools-arm64 /toolchain/lib/ld-musl-aarch64.so.1 /toolchain/lib/ld-musl-aarch64.so.1 + # Strip CNI package. 
FROM scratch AS pkg-cni-stripped-amd64 @@ -658,6 +671,10 @@ COPY --link --from=pkg-kmod-amd64 /usr/lib/libkmod.* /rootfs/lib/ COPY --link --from=pkg-kmod-amd64 /usr/bin/kmod /rootfs/sbin/modprobe COPY --link --from=modules-amd64 /lib/modules /rootfs/lib/modules COPY --link --from=machined-build-amd64 /machined /rootfs/sbin/init + +# this is a no-op as it copies from a scratch image when WITH_DEBUG_SHELL is not set +COPY --link --from=pkg-debug-tools-amd64 * /rootfs/ + RUN </dev/nu ARTIFACTS := _out TOOLS ?= ghcr.io/siderolabs/tools:v1.9.0-alpha.0-3-g1151610 +DEBUG_TOOLS_SOURCE := scratch + PKGS_PREFIX ?= ghcr.io/siderolabs PKGS ?= v1.9.0-alpha.0-18-gba0341e EXTRAS ?= v1.9.0-alpha.0 @@ -147,6 +149,12 @@ GO_LDFLAGS += -s -w endif GO_BUILDFLAGS_TALOSCTL := $(GO_BUILDFLAGS) -tags "$(GO_BUILDTAGS_TALOSCTL)" + +ifneq (, $(filter $(WITH_DEBUG_SHELL), t true TRUE y yes 1)) +# bash-minimal is a Dockerfile target that copies over the bash from siderolabs tools +DEBUG_TOOLS_SOURCE := bash-minimal +endif + GO_BUILDFLAGS += -tags "$(GO_BUILDTAGS)" , := , @@ -160,6 +168,7 @@ COMMON_ARGS += --progress=$(PROGRESS) COMMON_ARGS += --platform=$(PLATFORM) COMMON_ARGS += --push=$(PUSH) COMMON_ARGS += --build-arg=TOOLS=$(TOOLS) +COMMON_ARGS += --build-arg=DEBUG_TOOLS_SOURCE=$(DEBUG_TOOLS_SOURCE) COMMON_ARGS += --build-arg=PKGS=$(PKGS) COMMON_ARGS += --build-arg=EXTRAS=$(EXTRAS) COMMON_ARGS += --build-arg=GOFUMPT_VERSION=$(GOFUMPT_VERSION) diff --git a/cmd/talosctl/cmd/mgmt/cluster/create.go b/cmd/talosctl/cmd/mgmt/cluster/create.go index efdd3bf6f39..dd8422d22aa 100644 --- a/cmd/talosctl/cmd/mgmt/cluster/create.go +++ b/cmd/talosctl/cmd/mgmt/cluster/create.go @@ -86,6 +86,7 @@ const ( controlPlanePortFlag = "control-plane-port" firewallFlag = "with-firewall" tpm2EnabledFlag = "with-tpm2" + withDebugShellFlag = "with-debug-shell" // The following flags are the gen options - the options that are only used in machine configuration (i.e., not during the qemu/docker provisioning). 
// They are not applicable when no machine configuration is generated, hence mutually exclusive with the --input-dir flag. @@ -107,87 +108,89 @@ const ( ) var ( - talosconfig string - nodeImage string - nodeInstallImage string - registryMirrors []string - registryInsecure []string - kubernetesVersion string - nodeVmlinuzPath string - nodeInitramfsPath string - nodeISOPath string - nodeDiskImagePath string - nodeIPXEBootScript string - applyConfigEnabled bool - bootloaderEnabled bool - uefiEnabled bool - tpm2Enabled bool - extraUEFISearchPaths []string - configDebug bool - networkCIDR string - networkNoMasqueradeCIDRs []string - networkMTU int - networkIPv4 bool - networkIPv6 bool - wireguardCIDR string - nameservers []string - dnsDomain string - workers int - controlplanes int - controlPlaneCpus string - workersCpus string - controlPlaneMemory int - workersMemory int - clusterDiskSize int - clusterDiskPreallocate bool - clusterDisks []string - extraDisks int - extraDiskSize int - extraDisksDrivers []string - targetArch string - clusterWait bool - clusterWaitTimeout time.Duration - forceInitNodeAsEndpoint bool - forceEndpoint string - inputDir string - cniBinPath []string - cniConfDir string - cniCacheDir string - cniBundleURL string - ports string - dockerHostIP string - withInitNode bool - customCNIUrl string - crashdumpOnFailure bool - skipKubeconfig bool - skipInjectingConfig bool - talosVersion string - encryptStatePartition bool - encryptEphemeralPartition bool - useVIP bool - enableKubeSpan bool - enableClusterDiscovery bool - configPatch []string - configPatchControlPlane []string - configPatchWorker []string - badRTC bool - extraBootKernelArgs string - dockerDisableIPv6 bool - controlPlanePort int - kubePrismPort int - dhcpSkipHostname bool - skipK8sNodeReadinessCheck bool - networkChaos bool - jitter time.Duration - latency time.Duration - packetLoss float64 - packetReorder float64 - packetCorrupt float64 - bandwidth int - diskEncryptionKeyTypes []string - 
withFirewall string - withUUIDHostnames bool - withSiderolinkAgent agentFlag + talosconfig string + nodeImage string + nodeInstallImage string + registryMirrors []string + registryInsecure []string + kubernetesVersion string + nodeVmlinuzPath string + nodeInitramfsPath string + nodeISOPath string + nodeDiskImagePath string + nodeIPXEBootScript string + applyConfigEnabled bool + bootloaderEnabled bool + uefiEnabled bool + tpm2Enabled bool + extraUEFISearchPaths []string + configDebug bool + networkCIDR string + networkNoMasqueradeCIDRs []string + networkMTU int + networkIPv4 bool + networkIPv6 bool + wireguardCIDR string + nameservers []string + dnsDomain string + workers int + controlplanes int + controlPlaneCpus string + workersCpus string + controlPlaneMemory int + workersMemory int + clusterDiskSize int + clusterDiskPreallocate bool + clusterDisks []string + extraDisks int + extraDiskSize int + extraDisksDrivers []string + targetArch string + clusterWait bool + clusterWaitTimeout time.Duration + forceInitNodeAsEndpoint bool + forceEndpoint string + inputDir string + cniBinPath []string + cniConfDir string + cniCacheDir string + cniBundleURL string + ports string + dockerHostIP string + withInitNode bool + customCNIUrl string + crashdumpOnFailure bool + skipKubeconfig bool + skipInjectingConfig bool + talosVersion string + encryptStatePartition bool + encryptEphemeralPartition bool + useVIP bool + enableKubeSpan bool + enableClusterDiscovery bool + configPatch []string + configPatchControlPlane []string + configPatchWorker []string + badRTC bool + extraBootKernelArgs string + dockerDisableIPv6 bool + controlPlanePort int + kubePrismPort int + dhcpSkipHostname bool + skipK8sNodeReadinessCheck bool + networkChaos bool + jitter time.Duration + latency time.Duration + packetLoss float64 + packetReorder float64 + packetCorrupt float64 + bandwidth int + diskEncryptionKeyTypes []string + withFirewall string + withUUIDHostnames bool + withSiderolinkAgent agentFlag + 
debugShellEnabled bool + skipBootPhaseFinishedCheck bool ) // createCmd represents the cluster up command. @@ -468,6 +471,7 @@ func create(ctx context.Context) error { provision.WithBootlader(bootloaderEnabled), provision.WithUEFI(uefiEnabled), provision.WithTPM2(tpm2Enabled), + provision.WithDebugShell(debugShellEnabled), provision.WithExtraUEFISearchPaths(extraUEFISearchPaths), provision.WithTargetArch(targetArch), provision.WithSiderolinkAgent(withSiderolinkAgent.IsEnabled()), @@ -1172,6 +1176,8 @@ func init() { createCmd.Flags().BoolVar(&bootloaderEnabled, bootloaderEnabledFlag, true, "enable bootloader to load kernel and initramfs from disk image after install") createCmd.Flags().BoolVar(&uefiEnabled, "with-uefi", true, "enable UEFI on x86_64 architecture") createCmd.Flags().BoolVar(&tpm2Enabled, tpm2EnabledFlag, false, "enable TPM2 emulation support using swtpm") + createCmd.Flags().BoolVar(&debugShellEnabled, withDebugShellFlag, false, "drop talos into a maintenance shell on boot, this is for advanced debugging for developers only") + createCmd.Flags().MarkHidden(withDebugShellFlag) //nolint:errcheck createCmd.Flags().StringSliceVar(&extraUEFISearchPaths, "extra-uefi-search-paths", []string{}, "additional search paths for UEFI firmware (only applies when UEFI is enabled)") createCmd.Flags().StringSliceVar(&registryMirrors, registryMirrorFlag, []string{}, "list of registry mirrors to use in format: =") createCmd.Flags().StringSliceVar(&registryInsecure, registryInsecureFlag, []string{}, "list of registry hostnames to skip TLS verification for") diff --git a/internal/pkg/mount/switchroot/switchroot.go b/internal/pkg/mount/switchroot/switchroot.go index 846b10321bf..3ea459f65f6 100644 --- a/internal/pkg/mount/switchroot/switchroot.go +++ b/internal/pkg/mount/switchroot/switchroot.go @@ -108,6 +108,14 @@ func Switch(prefix string, mountpoints *mount.Points) (err error) { log.Printf("race detection enabled with halt_on_error=1") } + if val := 
procfs.ProcCmdline().Get("talos.debugshell"); val != nil { + if err = unix.Exec("/bin/bash", []string{"/bin/bash"}, envv); err != nil { + return fmt.Errorf("error executing /bin/bash: %w", err) + } + + return nil + } + if err = unix.Exec("/sbin/init", []string{"/sbin/init"}, envv); err != nil { return fmt.Errorf("error executing /sbin/init: %w", err) } diff --git a/pkg/provision/options.go b/pkg/provision/options.go index 502d591d26f..c742adaf58d 100644 --- a/pkg/provision/options.go +++ b/pkg/provision/options.go @@ -79,6 +79,15 @@ func WithTPM2(enabled bool) Option { } } +// WithDebugShell enables booting the node into a debug shell instead of init. +func WithDebugShell(enabled bool) Option { + return func(o *Options) error { + o.WithDebugShell = enabled + + return nil + } +} + // WithExtraUEFISearchPaths configures additional search paths to look for UEFI firmware. func WithExtraUEFISearchPaths(extraUEFISearchPaths []string) Option { return func(o *Options) error { @@ -157,6 +166,8 @@ type Options struct { UEFIEnabled bool // Enable TPM2 emulation using swtpm. TPM2Enabled bool + // Boot the node into a debug shell instead of init. + WithDebugShell bool // Configure additional search paths to look for UEFI firmware. 
ExtraUEFISearchPaths []string diff --git a/pkg/provision/providers/qemu/launch.go b/pkg/provision/providers/qemu/launch.go index 1725a13af6e..f03230c0bc9 100644 --- a/pkg/provision/providers/qemu/launch.go +++ b/pkg/provision/providers/qemu/launch.go @@ -56,6 +56,7 @@ type LaunchConfig struct { NodeUUID uuid.UUID BadRTC bool ArchitectureData Arch + WithDebugShell bool // Talos config Config string @@ -320,6 +321,14 @@ func launchVM(config *LaunchConfig) error { "pause", } + if config.WithDebugShell { + args = append( + args, + "-serial", + fmt.Sprintf("unix:%s/%s.serial,server,nowait", config.StatePath, config.Hostname), + ) + } + var ( scsiAttached, ahciAttached, nvmeAttached bool ahciBus int diff --git a/pkg/provision/providers/qemu/node.go b/pkg/provision/providers/qemu/node.go index e7f60e725e4..472a40b9901 100644 --- a/pkg/provision/providers/qemu/node.go +++ b/pkg/provision/providers/qemu/node.go @@ -89,6 +89,10 @@ func (p *provisioner) createNode(state *vm.State, clusterReq provision.ClusterRe } } + if opts.WithDebugShell { + cmdline.Append("talos.debugshell", "") + } + var nodeConfig string if !nodeReq.SkipInjectingConfig { @@ -157,6 +161,7 @@ func (p *provisioner) createNode(state *vm.State, clusterReq provision.ClusterRe TFTPServer: nodeReq.TFTPServer, IPXEBootFileName: nodeReq.IPXEBootFilename, APIPort: apiPort, + WithDebugShell: opts.WithDebugShell, } if clusterReq.IPXEBootScript != "" { diff --git a/pkg/provision/providers/qemu/preflight.go b/pkg/provision/providers/qemu/preflight.go index d71fe895e10..a15d83809de 100644 --- a/pkg/provision/providers/qemu/preflight.go +++ b/pkg/provision/providers/qemu/preflight.go @@ -34,6 +34,7 @@ func (p *provisioner) preflightChecks(ctx context.Context, request provision.Clu checkContext.qemuExecutable, checkContext.checkFlashImages, checkContext.swtpmExecutable, + checkContext.numberOfNodesWhenDebugShellEnabled, checkContext.cniDirectories, checkContext.cniBundle, checkContext.checkIptables, @@ -113,6 +114,14 @@ 
func (check *preflightCheckContext) swtpmExecutable(ctx context.Context) error { return nil } +func (check *preflightCheckContext) numberOfNodesWhenDebugShellEnabled(ctx context.Context) error { + if check.options.WithDebugShell && len(check.request.Nodes.ControlPlaneNodes())+len(check.request.Nodes.WorkerNodes()) > 1 { + return fmt.Errorf("--with-debug-shell is not supported with more than one node") + } + + return nil +} + func (check *preflightCheckContext) cniDirectories(ctx context.Context) error { cniDirs := append([]string{}, check.request.Network.CNI.BinPath...) cniDirs = append(cniDirs, check.request.Network.CNI.CacheDir, check.request.Network.CNI.ConfDir)