From 32a26a711b31f482a8ec0d9d96e3f8d7a113a2ce Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 7 Sep 2023 04:30:28 +0000 Subject: [PATCH 01/21] build(deps): bump github.com/cyphar/filepath-securejoin Bumps [github.com/cyphar/filepath-securejoin](https://github.com/cyphar/filepath-securejoin) from 0.2.3 to 0.2.4. - [Release notes](https://github.com/cyphar/filepath-securejoin/releases) - [Commits](https://github.com/cyphar/filepath-securejoin/compare/v0.2.3...v0.2.4) --- updated-dependencies: - dependency-name: github.com/cyphar/filepath-securejoin dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] (cherry picked from commit c7ad2749fde1445b627b53a568aa85e3ca50fbd2) Signed-off-by: Sebastiaan van Stijn --- go.mod | 2 +- go.sum | 4 ++-- .../cyphar/filepath-securejoin/.travis.yml | 21 ------------------- .../cyphar/filepath-securejoin/README.md | 2 +- .../cyphar/filepath-securejoin/VERSION | 2 +- .../cyphar/filepath-securejoin/join.go | 12 ++++++++++- vendor/modules.txt | 2 +- 7 files changed, 17 insertions(+), 28 deletions(-) delete mode 100644 vendor/github.com/cyphar/filepath-securejoin/.travis.yml diff --git a/go.mod b/go.mod index 16d0f31bd87..f51b6432da2 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/cilium/ebpf v0.7.0 github.com/containerd/console v1.0.3 github.com/coreos/go-systemd/v22 v22.3.2 - github.com/cyphar/filepath-securejoin v0.2.3 + github.com/cyphar/filepath-securejoin v0.2.4 github.com/docker/go-units v0.4.0 github.com/godbus/dbus/v5 v5.0.6 github.com/moby/sys/mountinfo v0.5.0 diff --git a/go.sum b/go.sum index ceee93a85ec..ecabd3981c5 100644 --- a/go.sum +++ b/go.sum @@ -9,8 +9,8 @@ github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzA github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY= github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= -github.com/cyphar/filepath-securejoin v0.2.3 h1:YX6ebbZCZP7VkM3scTTokDgBL2TY741X51MTk3ycuNI= -github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= +github.com/cyphar/filepath-securejoin v0.2.4 h1:Ugdm7cg7i6ZK6x3xDF1oEu1nfkyfH53EtKeQYTC3kyg= +github.com/cyphar/filepath-securejoin v0.2.4/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw= diff --git a/vendor/github.com/cyphar/filepath-securejoin/.travis.yml b/vendor/github.com/cyphar/filepath-securejoin/.travis.yml deleted file mode 100644 index b94ff8cf92a..00000000000 --- a/vendor/github.com/cyphar/filepath-securejoin/.travis.yml +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (C) 2017 SUSE LLC. All rights reserved. -# Use of this source code is governed by a BSD-style -# license that can be found in the LICENSE file. - -language: go -go: - - 1.13.x - - 1.16.x - - tip -arch: - - AMD64 - - ppc64le -os: - - linux - - osx - -script: - - go test -cover -v ./... - -notifications: - email: false diff --git a/vendor/github.com/cyphar/filepath-securejoin/README.md b/vendor/github.com/cyphar/filepath-securejoin/README.md index 3624617c89b..4eca0f23550 100644 --- a/vendor/github.com/cyphar/filepath-securejoin/README.md +++ b/vendor/github.com/cyphar/filepath-securejoin/README.md @@ -1,6 +1,6 @@ ## `filepath-securejoin` ## -[![Build Status](https://travis-ci.org/cyphar/filepath-securejoin.svg?branch=master)](https://travis-ci.org/cyphar/filepath-securejoin) +[![Build Status](https://github.com/cyphar/filepath-securejoin/actions/workflows/ci.yml/badge.svg)](https://github.com/cyphar/filepath-securejoin/actions/workflows/ci.yml) An implementation of `SecureJoin`, a [candidate for inclusion in the Go standard library][go#20126]. The purpose of this function is to be a "secure" diff --git a/vendor/github.com/cyphar/filepath-securejoin/VERSION b/vendor/github.com/cyphar/filepath-securejoin/VERSION index 7179039691c..abd410582de 100644 --- a/vendor/github.com/cyphar/filepath-securejoin/VERSION +++ b/vendor/github.com/cyphar/filepath-securejoin/VERSION @@ -1 +1 @@ -0.2.3 +0.2.4 diff --git a/vendor/github.com/cyphar/filepath-securejoin/join.go b/vendor/github.com/cyphar/filepath-securejoin/join.go index 7dd08dbbdf7..aa32b85fb84 100644 --- a/vendor/github.com/cyphar/filepath-securejoin/join.go +++ b/vendor/github.com/cyphar/filepath-securejoin/join.go @@ -39,17 +39,27 @@ func IsNotExist(err error) bool { // components in the returned string are not modified (in other words are not // replaced with symlinks on the filesystem) after this function has returned. // Such a symlink race is necessarily out-of-scope of SecureJoin. +// +// Volume names in unsafePath are always discarded, regardless if they are +// provided via direct input or when evaluating symlinks. Therefore: +// +// "C:\Temp" + "D:\path\to\file.txt" results in "C:\Temp\path\to\file.txt" func SecureJoinVFS(root, unsafePath string, vfs VFS) (string, error) { // Use the os.* VFS implementation if none was specified. if vfs == nil { vfs = osVFS{} } + unsafePath = filepath.FromSlash(unsafePath) var path bytes.Buffer n := 0 for unsafePath != "" { if n > 255 { - return "", &os.PathError{Op: "SecureJoin", Path: root + "/" + unsafePath, Err: syscall.ELOOP} + return "", &os.PathError{Op: "SecureJoin", Path: root + string(filepath.Separator) + unsafePath, Err: syscall.ELOOP} + } + + if v := filepath.VolumeName(unsafePath); v != "" { + unsafePath = unsafePath[len(v):] } // Next path component, p. diff --git a/vendor/modules.txt b/vendor/modules.txt index 5f48b38d49a..a5537dfe33c 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -20,7 +20,7 @@ github.com/coreos/go-systemd/v22/dbus # github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d ## explicit; go 1.12 github.com/cpuguy83/go-md2man/v2/md2man -# github.com/cyphar/filepath-securejoin v0.2.3 +# github.com/cyphar/filepath-securejoin v0.2.4 ## explicit; go 1.13 github.com/cyphar/filepath-securejoin # github.com/docker/go-units v0.4.0 From 87792ce02cd5b873aef5d44c855cf200885d0ab8 Mon Sep 17 00:00:00 2001 From: Heran Yang Date: Sat, 9 Sep 2023 15:40:39 +0800 Subject: [PATCH 02/21] libct/cg: add swapOnlyUsage in MemoryStats This field reports swap-only usage. For cgroupv1, `Usage` and `Failcnt` are set by subtracting memory usage from memory+swap usage. For cgroupv2, `Usage`, `Limit`, and `MaxUsage` are set. This commit also export `MaxUsage` of memory under cgroupv2 mode, using `memory.peak` introduced in kernel 5.19. Signed-off-by: Heran Yang (cherry picked from commit 104b8dc951ee88628f9fbf3222961bb2de8b0294) Signed-off-by: Harshal Patil --- CHANGELOG.md | 4 ++++ libcontainer/cgroups/fs/memory.go | 4 ++++ libcontainer/cgroups/fs/memory_test.go | 13 +++++++------ libcontainer/cgroups/fs2/memory.go | 18 ++++++++++++++++-- libcontainer/cgroups/fs2/memory_test.go | 16 ++++++++++++++++ libcontainer/cgroups/stats.go | 2 ++ 6 files changed, 49 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 517e9d8abfd..13d2ea0b4bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,6 +46,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 This aligns cgroupv2 root usage more closely with cgroupv1 reporting. Additionally, report root swap usage as sum of swap and memory usage, aligned with v1 and existing non-root v2 reporting. (#3933) + * Add `swapOnlyUsage` in `MemoryStats`. This field reports swap-only usage. + For cgroupv1, `Usage` and `Failcnt` are set by subtracting memory usage + from memory+swap usage. For cgroupv2, `Usage`, `Limit`, and `MaxUsage` + are set. (#4010) ## [1.1.8] - 2023-07-20 diff --git a/libcontainer/cgroups/fs/memory.go b/libcontainer/cgroups/fs/memory.go index a0e78074980..783566d68f0 100644 --- a/libcontainer/cgroups/fs/memory.go +++ b/libcontainer/cgroups/fs/memory.go @@ -170,6 +170,10 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { return err } stats.MemoryStats.SwapUsage = swapUsage + stats.MemoryStats.SwapOnlyUsage = cgroups.MemoryData{ + Usage: swapUsage.Usage - memoryUsage.Usage, + Failcnt: swapUsage.Failcnt - memoryUsage.Failcnt, + } kernelUsage, err := getMemoryData(path, "kmem") if err != nil { return err diff --git a/libcontainer/cgroups/fs/memory_test.go b/libcontainer/cgroups/fs/memory_test.go index d305a62a393..95e9d3cbaa4 100644 --- a/libcontainer/cgroups/fs/memory_test.go +++ b/libcontainer/cgroups/fs/memory_test.go @@ -249,12 +249,13 @@ func TestMemoryStats(t *testing.T) { t.Fatal(err) } expectedStats := cgroups.MemoryStats{ - Cache: 512, - Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, - SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, - KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, - Stats: map[string]uint64{"cache": 512, "rss": 1024}, - UseHierarchy: true, + Cache: 512, + Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, + SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, + SwapOnlyUsage: cgroups.MemoryData{Usage: 0, MaxUsage: 0, Failcnt: 0, Limit: 0}, + KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, + Stats: map[string]uint64{"cache": 512, "rss": 1024}, + UseHierarchy: true, PageUsageByNUMA: cgroups.PageUsageByNUMA{ PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{ Total: cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}}, diff --git a/libcontainer/cgroups/fs2/memory.go b/libcontainer/cgroups/fs2/memory.go index 9cca98c4c0a..01fe7d8e12d 100644 --- a/libcontainer/cgroups/fs2/memory.go +++ b/libcontainer/cgroups/fs2/memory.go @@ -100,7 +100,7 @@ func statMemory(dirPath string, stats *cgroups.Stats) error { memoryUsage, err := getMemoryDataV2(dirPath, "") if err != nil { if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint { - // The root cgroup does not have memory.{current,max} + // The root cgroup does not have memory.{current,max,peak} // so emulate those using data from /proc/meminfo and // /sys/fs/cgroup/memory.stat return rootStatsFromMeminfo(stats) @@ -108,10 +108,12 @@ func statMemory(dirPath string, stats *cgroups.Stats) error { return err } stats.MemoryStats.Usage = memoryUsage - swapUsage, err := getMemoryDataV2(dirPath, "swap") + swapOnlyUsage, err := getMemoryDataV2(dirPath, "swap") if err != nil { return err } + stats.MemoryStats.SwapOnlyUsage = swapOnlyUsage + swapUsage := swapOnlyUsage // As cgroup v1 reports SwapUsage values as mem+swap combined, // while in cgroup v2 swap values do not include memory, // report combined mem+swap for v1 compatibility. @@ -119,6 +121,9 @@ func statMemory(dirPath string, stats *cgroups.Stats) error { if swapUsage.Limit != math.MaxUint64 { swapUsage.Limit += memoryUsage.Limit } + // The `MaxUsage` of mem+swap cannot simply combine mem with + // swap. So set it to 0 for v1 compatibility. + swapUsage.MaxUsage = 0 stats.MemoryStats.SwapUsage = swapUsage return nil @@ -133,6 +138,7 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { } usage := moduleName + ".current" limit := moduleName + ".max" + maxUsage := moduleName + ".peak" value, err := fscommon.GetCgroupParamUint(path, usage) if err != nil { @@ -152,6 +158,14 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { } memoryData.Limit = value + // `memory.peak` since kernel 5.19 + // `memory.swap.peak` since kernel 6.5 + value, err = fscommon.GetCgroupParamUint(path, maxUsage) + if err != nil && !os.IsNotExist(err) { + return cgroups.MemoryData{}, err + } + memoryData.MaxUsage = value + return memoryData, nil } diff --git a/libcontainer/cgroups/fs2/memory_test.go b/libcontainer/cgroups/fs2/memory_test.go index 2e2713c29ae..89c999d0cee 100644 --- a/libcontainer/cgroups/fs2/memory_test.go +++ b/libcontainer/cgroups/fs2/memory_test.go @@ -94,6 +94,10 @@ func TestStatMemoryPodCgroup(t *testing.T) { t.Fatal(err) } + if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.peak"), []byte("987654321"), 0o644); err != nil { + t.Fatal(err) + } + gotStats := cgroups.NewStats() // use a fake root path to trigger the pod cgroup lookup. @@ -107,6 +111,18 @@ func TestStatMemoryPodCgroup(t *testing.T) { if gotStats.MemoryStats.Usage.Usage != expectedUsageBytes { t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.Usage, expectedUsageBytes) } + + // result should be "memory.max" + var expectedLimitBytes uint64 = 999999999 + if gotStats.MemoryStats.Usage.Limit != expectedLimitBytes { + t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.Limit, expectedLimitBytes) + } + + // result should be "memory.peak" + var expectedMaxUsageBytes uint64 = 987654321 + if gotStats.MemoryStats.Usage.MaxUsage != expectedMaxUsageBytes { + t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.MaxUsage, expectedMaxUsageBytes) + } } func TestRootStatsFromMeminfo(t *testing.T) { diff --git a/libcontainer/cgroups/stats.go b/libcontainer/cgroups/stats.go index 40a81dd5a86..0d8371b05f1 100644 --- a/libcontainer/cgroups/stats.go +++ b/libcontainer/cgroups/stats.go @@ -78,6 +78,8 @@ type MemoryStats struct { Usage MemoryData `json:"usage,omitempty"` // usage of memory + swap SwapUsage MemoryData `json:"swap_usage,omitempty"` + // usage of swap only + SwapOnlyUsage MemoryData `json:"swap_only_usage,omitempty"` // usage of kernel memory KernelUsage MemoryData `json:"kernel_usage,omitempty"` // usage of kernel TCP memory From 0c8e2cc602bab3c8c1127b50aa26fa6f6954e58b Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 21 Nov 2023 00:36:28 +1100 Subject: [PATCH 03/21] *: actually support joining a userns with a new container (This is a cherry-pick of 1912d5988bbb379189ea9ceb2e03945738c513dc.) Our handling for name space paths with user namespaces has been broken for a long time. In particular, the need to parse /proc/self/*id_map in quite a few places meant that we would treat userns configurations that had a namespace path as if they were a userns configuration without mappings, resulting in errors. The primary issue was down to the id translation helper functions, which could only handle configurations that had explicit mappings. Obviously, when joining a user namespace we need to map the ids but figuring out the correct mapping is non-trivial in comparison. In order to get the mapping, you need to read /proc//*id_map of a process inside the userns -- while most userns paths will be of the form /proc//ns/user (and we have a fast-path for this case), this is not guaranteed and thus it is necessary to spawn a process inside the container and read its /proc//*id_map files in the general case. As Go does not allow us spawn a subprocess into a target userns, we have to use CGo to fork a sub-process which does the setns(2). To be honest, this is a little dodgy in regards to POSIX signal-safety(7) but since we do no allocations and we are executing in the forked context from a Go program (not a C program), it should be okay. The other alternative would be to do an expensive re-exec (a-la nsexec which would make several other bits of runc more complicated), or to use nsenter(1) which might not exist on the system and is less than ideal. Because we need to logically remap users quite a few times in runc (including in "runc init", where joining the namespace is not feasable), we cache the mapping inside the libcontainer config struct. A future patch will make sure that we stop allow invalid user configurations where a mapping is specified as well as a userns path to join. Finally, add an integration test to make sure we don't regress this again. Signed-off-by: Aleksa Sarai --- libcontainer/configs/validate/rootless.go | 31 ++-- libcontainer/specconv/spec_linux.go | 16 ++ libcontainer/userns/userns_maps.c | 79 ++++++++++ libcontainer/userns/userns_maps_linux.go | 171 ++++++++++++++++++++++ tests/integration/userns.bats | 83 ++++++++++- 5 files changed, 360 insertions(+), 20 deletions(-) create mode 100644 libcontainer/userns/userns_maps.c create mode 100644 libcontainer/userns/userns_maps_linux.go diff --git a/libcontainer/configs/validate/rootless.go b/libcontainer/configs/validate/rootless.go index 9a6e5eb32a3..37c383366fe 100644 --- a/libcontainer/configs/validate/rootless.go +++ b/libcontainer/configs/validate/rootless.go @@ -28,25 +28,18 @@ func (v *ConfigValidator) rootlessEUID(config *configs.Config) error { return nil } -func hasIDMapping(id int, mappings []configs.IDMap) bool { - for _, m := range mappings { - if id >= m.ContainerID && id < m.ContainerID+m.Size { - return true - } - } - return false -} - func rootlessEUIDMappings(config *configs.Config) error { if !config.Namespaces.Contains(configs.NEWUSER) { return errors.New("rootless container requires user namespaces") } - - if len(config.UidMappings) == 0 { - return errors.New("rootless containers requires at least one UID mapping") - } - if len(config.GidMappings) == 0 { - return errors.New("rootless containers requires at least one GID mapping") + // We only require mappings if we are not joining another userns. + if path := config.Namespaces.PathOf(configs.NEWUSER); path == "" { + if len(config.UidMappings) == 0 { + return errors.New("rootless containers requires at least one UID mapping") + } + if len(config.GidMappings) == 0 { + return errors.New("rootless containers requires at least one GID mapping") + } } return nil } @@ -70,8 +63,8 @@ func rootlessEUIDMount(config *configs.Config) error { // Ignore unknown mount options. continue } - if !hasIDMapping(uid, config.UidMappings) { - return errors.New("cannot specify uid= mount options for unmapped uid in rootless containers") + if _, err := config.HostUID(uid); err != nil { + return fmt.Errorf("cannot specify uid=%d mount option for rootless container: %w", uid, err) } } @@ -82,8 +75,8 @@ func rootlessEUIDMount(config *configs.Config) error { // Ignore unknown mount options. continue } - if !hasIDMapping(gid, config.GidMappings) { - return errors.New("cannot specify gid= mount options for unmapped gid in rootless containers") + if _, err := config.HostGID(gid); err != nil { + return fmt.Errorf("cannot specify gid=%d mount option for rootless container: %w", gid, err) } } } diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 1b358b24b4d..bef6a51df42 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -18,6 +18,7 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/userns" libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" @@ -939,6 +940,21 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { config.GidMappings = append(config.GidMappings, create(m)) } } + if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" { + // Cache the current userns mappings in our configuration, so that we + // can calculate uid and gid mappings within runc. These mappings are + // never used for configuring the container if the path is set. + uidMap, gidMap, err := userns.GetUserNamespaceMappings(path) + if err != nil { + return fmt.Errorf("failed to cache mappings for userns: %w", err) + } + config.UidMappings = uidMap + config.GidMappings = gidMap + logrus.WithFields(logrus.Fields{ + "uid_map": uidMap, + "gid_map": gidMap, + }).Debugf("config uses path-based userns configuration -- current uid and gid mappings cached") + } rootUID, err := config.HostRootUID() if err != nil { return err diff --git a/libcontainer/userns/userns_maps.c b/libcontainer/userns/userns_maps.c new file mode 100644 index 00000000000..84f2c6188c3 --- /dev/null +++ b/libcontainer/userns/userns_maps.c @@ -0,0 +1,79 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +/* + * All of the code here is run inside an aync-signal-safe context, so we need + * to be careful to not call any functions that could cause issues. In theory, + * since we are a Go program, there are fewer restrictions in practice, it's + * better to be safe than sorry. + * + * The only exception is exit, which we need to call to make sure we don't + * return into runc. + */ + +void bail(int pipefd, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vdprintf(pipefd, fmt, args); + va_end(args); + + exit(1); +} + +int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd) +{ + char buffer[4096] = { 0 }; + + pid_t child = fork(); + if (child != 0) + return child; + /* in child */ + + /* Join the target userns. */ + int nsfd = open(userns_path, O_RDONLY); + if (nsfd < 0) + bail(errfd, "open userns path %s failed: %m", userns_path); + + int err = setns(nsfd, CLONE_NEWUSER); + if (err < 0) + bail(errfd, "setns %s failed: %m", userns_path); + + close(nsfd); + + /* Pipe the requested file contents. */ + int fd = open(path, O_RDONLY); + if (fd < 0) + bail(errfd, "open %s in userns %s failed: %m", path, userns_path); + + int nread, ntotal = 0; + while ((nread = read(fd, buffer, sizeof(buffer))) != 0) { + if (nread < 0) + bail(errfd, "read bytes from %s failed (after %d total bytes read): %m", path, ntotal); + ntotal += nread; + + int nwritten = 0; + while (nwritten < nread) { + int n = write(outfd, buffer, nread - nwritten); + if (n < 0) + bail(errfd, "write %d bytes from %s failed (after %d bytes written): %m", + nread - nwritten, path, nwritten); + nwritten += n; + } + if (nread != nwritten) + bail(errfd, "mismatch for bytes read and written: %d read != %d written", nread, nwritten); + } + + close(fd); + close(outfd); + close(errfd); + + /* We must exit here, otherwise we would return into a forked runc. */ + exit(0); +} diff --git a/libcontainer/userns/userns_maps_linux.go b/libcontainer/userns/userns_maps_linux.go new file mode 100644 index 00000000000..d81290de2cb --- /dev/null +++ b/libcontainer/userns/userns_maps_linux.go @@ -0,0 +1,171 @@ +//go:build linux + +package userns + +import ( + "bufio" + "bytes" + "fmt" + "io" + "os" + "unsafe" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" +) + +/* +#include +extern int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd); +*/ +import "C" + +func parseIdmapData(data []byte) (ms []configs.IDMap, err error) { + scanner := bufio.NewScanner(bytes.NewReader(data)) + for scanner.Scan() { + var m configs.IDMap + line := scanner.Text() + if _, err := fmt.Sscanf(line, "%d %d %d", &m.ContainerID, &m.HostID, &m.Size); err != nil { + return nil, fmt.Errorf("parsing id map failed: invalid format in line %q: %w", line, err) + } + ms = append(ms, m) + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("parsing id map failed: %w", err) + } + return ms, nil +} + +// Do something equivalent to nsenter --user= cat , but more +// efficiently. Returns the contents of the requested file from within the user +// namespace. +func spawnUserNamespaceCat(nsPath string, path string) ([]byte, error) { + rdr, wtr, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("create pipe for userns spawn failed: %w", err) + } + defer rdr.Close() + defer wtr.Close() + + errRdr, errWtr, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("create error pipe for userns spawn failed: %w", err) + } + defer errRdr.Close() + defer errWtr.Close() + + cNsPath := C.CString(nsPath) + defer C.free(unsafe.Pointer(cNsPath)) + cPath := C.CString(path) + defer C.free(unsafe.Pointer(cPath)) + + childPid := C.spawn_userns_cat(cNsPath, cPath, C.int(wtr.Fd()), C.int(errWtr.Fd())) + + if childPid < 0 { + return nil, fmt.Errorf("failed to spawn fork for userns") + } else if childPid == 0 { + // this should never happen + panic("runc executing inside fork child -- unsafe state!") + } + + // We are in the parent -- close the write end of the pipe before reading. + wtr.Close() + output, err := io.ReadAll(rdr) + rdr.Close() + if err != nil { + return nil, fmt.Errorf("reading from userns spawn failed: %w", err) + } + + // Ditto for the error pipe. + errWtr.Close() + errOutput, err := io.ReadAll(errRdr) + errRdr.Close() + if err != nil { + return nil, fmt.Errorf("reading from userns spawn error pipe failed: %w", err) + } + errOutput = bytes.TrimSpace(errOutput) + + // Clean up the child. + child, err := os.FindProcess(int(childPid)) + if err != nil { + return nil, fmt.Errorf("could not find userns spawn process: %w", err) + } + state, err := child.Wait() + if err != nil { + return nil, fmt.Errorf("failed to wait for userns spawn process: %w", err) + } + if !state.Success() { + errStr := string(errOutput) + if errStr == "" { + errStr = fmt.Sprintf("unknown error (status code %d)", state.ExitCode()) + } + return nil, fmt.Errorf("userns spawn: %s", errStr) + } else if len(errOutput) > 0 { + // We can just ignore weird output in the error pipe if the process + // didn't bail(), but for completeness output for debugging. + logrus.Debugf("userns spawn succeeded but unexpected error message found: %s", string(errOutput)) + } + // The subprocess succeeded, return whatever it wrote to the pipe. + return output, nil +} + +func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, err error) { + var ( + pid int + extra rune + tryFastPath bool + ) + + // nsPath is usually of the form /proc//ns/user, which means that we + // already have a pid that is part of the user namespace and thus we can + // just use the pid to read from /proc//*id_map. + // + // Note that Sscanf doesn't consume the whole input, so we check for any + // trailing data with %c. That way, we can be sure the pattern matched + // /proc/$pid/ns/user _exactly_ iff n === 1. + if n, _ := fmt.Sscanf(nsPath, "/proc/%d/ns/user%c", &pid, &extra); n == 1 { + tryFastPath = pid > 0 + } + + for _, mapType := range []struct { + name string + idMap *[]configs.IDMap + }{ + {"uid_map", &uidMap}, + {"gid_map", &gidMap}, + } { + var mapData []byte + + if tryFastPath { + path := fmt.Sprintf("/proc/%d/%s", pid, mapType.name) + data, err := os.ReadFile(path) + if err != nil { + // Do not error out here -- we need to try the slow path if the + // fast path failed. + logrus.Debugf("failed to use fast path to read %s from userns %s (error: %s), falling back to slow userns-join path", mapType.name, nsPath, err) + } else { + mapData = data + } + } else { + logrus.Debugf("cannot use fast path to read %s from userns %s, falling back to slow userns-join path", mapType.name, nsPath) + } + + if mapData == nil { + // We have to actually join the namespace if we cannot take the + // fast path. The path is resolved with respect to the child + // process, so just use /proc/self. + data, err := spawnUserNamespaceCat(nsPath, "/proc/self/"+mapType.name) + if err != nil { + return nil, nil, err + } + mapData = data + } + idMap, err := parseIdmapData(mapData) + if err != nil { + return nil, nil, fmt.Errorf("failed to parse %s of userns %s: %w", mapType.name, nsPath, err) + } + *mapType.idMap = idMap + } + + return uidMap, gidMap, nil +} diff --git a/tests/integration/userns.bats b/tests/integration/userns.bats index eca1e0ce58a..2329119a2fe 100644 --- a/tests/integration/userns.bats +++ b/tests/integration/userns.bats @@ -15,15 +15,25 @@ function setup() { mkdir -p rootfs/{proc,sys,tmp} mkdir -p rootfs/tmp/mount-{1,2} + to_umount_list="$(mktemp "$BATS_RUN_TMPDIR/userns-mounts.XXXXXX")" if [ "$ROOTLESS" -eq 0 ]; then update_config ' .linux.namespaces += [{"type": "user"}] | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] - | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] ' + | .linux.gidMappings += [{"hostID": 200000, "containerID": 0, "size": 65534}] ' fi } function teardown() { teardown_bundle + + if [ -v to_umount_list ]; then + while read -r mount_path; do + umount -l "$mount_path" || : + rm -f "$mount_path" + done <"$to_umount_list" + rm -f "$to_umount_list" + unset to_umount_list + fi } @test "userns with simple mount" { @@ -83,3 +93,74 @@ function teardown() { runc exec test_busybox cat /sys/fs/cgroup/{pids,memory}/tasks [ "$status" -eq 0 ] } + +@test "userns join other container userns" { + # Create a detached container with the id-mapping we want. + update_config '.process.args = ["sleep", "infinity"]' + runc run -d --console-socket "$CONSOLE_SOCKET" target_userns + [ "$status" -eq 0 ] + + # Configure our container to attach to the first container's userns. + target_pid="$(__runc state target_userns | jq .pid)" + update_config '.linux.namespaces |= map(if .type == "user" then (.path = "/proc/'"$target_pid"'/ns/" + .type) else . end) + | del(.linux.uidMappings) + | del(.linux.gidMappings)' + runc run -d --console-socket "$CONSOLE_SOCKET" in_userns + [ "$status" -eq 0 ] + + runc exec in_userns cat /proc/self/uid_map + [ "$status" -eq 0 ] + if [ $EUID -eq 0 ]; then + grep -E '^\s+0\s+100000\s+65534$' <<<"$output" + else + grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" + fi + + runc exec in_userns cat /proc/self/gid_map + [ "$status" -eq 0 ] + if [ $EUID -eq 0 ]; then + grep -E '^\s+0\s+200000\s+65534$' <<<"$output" + else + grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" + fi +} + +@test "userns join other container userns [bind-mounted nsfd]" { + requires root + + # Create a detached container with the id-mapping we want. + update_config '.process.args = ["sleep", "infinity"]' + runc run -d --console-socket "$CONSOLE_SOCKET" target_userns + [ "$status" -eq 0 ] + + # Bind-mount the first containers userns nsfd to a different path, to + # exercise the non-fast-path (where runc has to join the userns to get the + # mappings). + target_pid="$(__runc state target_userns | jq .pid)" + userns_path=$(mktemp "$BATS_RUN_TMPDIR/userns.XXXXXX") + mount --bind "/proc/$target_pid/ns/user" "$userns_path" + echo "$userns_path" >>"$to_umount_list" + + # Configure our container to attach to the first container's userns. + update_config '.linux.namespaces |= map(if .type == "user" then (.path = "'"$userns_path"'") else . end) + | del(.linux.uidMappings) + | del(.linux.gidMappings)' + runc run -d --console-socket "$CONSOLE_SOCKET" in_userns + [ "$status" -eq 0 ] + + runc exec in_userns cat /proc/self/uid_map + [ "$status" -eq 0 ] + if [ $EUID -eq 0 ]; then + grep -E '^\s+0\s+100000\s+65534$' <<<"$output" + else + grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" + fi + + runc exec in_userns cat /proc/self/gid_map + [ "$status" -eq 0 ] + if [ $EUID -eq 0 ]; then + grep -E '^\s+0\s+200000\s+65534$' <<<"$output" + else + grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" + fi +} From 8f8cb45572c7574325c89ffdf8f5b8112a4e7a97 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 21 Nov 2023 17:20:36 +1100 Subject: [PATCH 04/21] configs: disallow ambiguous userns and timens configurations (This is a cherry-pick of 09822c3da8ad8fa91b8796c5abf27ef06814a0c3.) For userns and timens, the mappings (and offsets, respectively) cannot be changed after the namespace is first configured. Thus, configuring a container with a namespace path to join means that you cannot also provide configuration for said namespace. Previously we would silently ignore the configuration (and just join the provided path), but we really should be returning an error (especially when you consider that the configuration userns mappings are used quite a bit in runc with the assumption that they are the correct mapping for the userns -- but in this case they are not). In the case of userns, the mappings are also required if you _do not_ specify a path, while in the case of the time namespace you can have a container with a timens but no mappings specified. It should be noted that the case checking that the user has not specified a userns path and a userns mapping needs to be handled in specconv (as opposed to the configuration validator) because with this patchset we now cache the mappings of path-based userns configurations and thus the validator can't be sure whether the mapping is a cached mapping or a user-specified one. So we do the validation in specconv, and thus the test for this needs to be an integration test. Signed-off-by: Aleksa Sarai --- libcontainer/configs/validate/validator.go | 12 +++++-- .../configs/validate/validator_test.go | 10 +++--- libcontainer/integration/exec_test.go | 6 ++++ libcontainer/specconv/spec_linux.go | 5 +++ libcontainer/specconv/spec_linux_test.go | 34 +++++++++++++++++++ 5 files changed, 61 insertions(+), 6 deletions(-) diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go index 4fbd308dadd..ece70a45d31 100644 --- a/libcontainer/configs/validate/validator.go +++ b/libcontainer/configs/validate/validator.go @@ -109,11 +109,19 @@ func (v *ConfigValidator) security(config *configs.Config) error { func (v *ConfigValidator) usernamespace(config *configs.Config) error { if config.Namespaces.Contains(configs.NEWUSER) { if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { - return errors.New("USER namespaces aren't enabled in the kernel") + return errors.New("user namespaces aren't enabled in the kernel") } + hasPath := config.Namespaces.PathOf(configs.NEWUSER) != "" + hasMappings := config.UidMappings != nil || config.GidMappings != nil + if !hasPath && !hasMappings { + return errors.New("user namespaces enabled, but no namespace path to join nor mappings to apply specified") + } + // The hasPath && hasMappings validation case is handled in specconv -- + // we cache the mappings in Config during specconv in the hasPath case, + // so we cannot do that validation here. } else { if config.UidMappings != nil || config.GidMappings != nil { - return errors.New("User namespace mappings specified, but USER namespace isn't enabled in the config") + return errors.New("user namespace mappings specified, but user namespace isn't enabled in the config") } } return nil diff --git a/libcontainer/configs/validate/validator_test.go b/libcontainer/configs/validate/validator_test.go index 5181333fb12..e89a60b08c5 100644 --- a/libcontainer/configs/validate/validator_test.go +++ b/libcontainer/configs/validate/validator_test.go @@ -150,7 +150,7 @@ func TestValidateSecurityWithoutNEWNS(t *testing.T) { } } -func TestValidateUsernamespace(t *testing.T) { +func TestValidateUserNamespace(t *testing.T) { if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { t.Skip("Test requires userns.") } @@ -161,6 +161,8 @@ func TestValidateUsernamespace(t *testing.T) { {Type: configs.NEWUSER}, }, ), + UidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}}, + GidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}}, } validator := New() @@ -170,11 +172,11 @@ func TestValidateUsernamespace(t *testing.T) { } } -func TestValidateUsernamespaceWithoutUserNS(t *testing.T) { - uidMap := configs.IDMap{ContainerID: 123} +func TestValidateUsernsMappingWithoutNamespace(t *testing.T) { config := &configs.Config{ Rootfs: "/var", - UidMappings: []configs.IDMap{uidMap}, + UidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}}, + GidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}}, } validator := New() diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 5c6272e5d42..670b33fcbb3 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -18,6 +18,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/userns" "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" @@ -1588,6 +1589,11 @@ func TestInitJoinNetworkAndUser(t *testing.T) { config2 := newTemplateConfig(t, &tParam{userns: true}) config2.Namespaces.Add(configs.NEWNET, netns1) config2.Namespaces.Add(configs.NEWUSER, userns1) + // Emulate specconv.setupUserNamespace(). + uidMap, gidMap, err := userns.GetUserNamespaceMappings(userns1) + ok(t, err) + config2.UidMappings = uidMap + config2.GidMappings = gidMap config2.Cgroups.Path = "integration/test2" container2, err := newContainer(t, config2) ok(t, err) diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index bef6a51df42..0cbf578112d 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -941,6 +941,11 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { } } if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" { + // We cannot allow uid or gid mappings to be set if we are also asked + // to join a userns. + if config.UidMappings != nil || config.GidMappings != nil { + return errors.New("user namespaces enabled, but both namespace path and mapping specified -- you may only provide one") + } // Cache the current userns mappings in our configuration, so that we // can calculate uid and gid mappings within runc. These mappings are // never used for configuring the container if the path is set. diff --git a/libcontainer/specconv/spec_linux_test.go b/libcontainer/specconv/spec_linux_test.go index 79768e31861..68e57a08308 100644 --- a/libcontainer/specconv/spec_linux_test.go +++ b/libcontainer/specconv/spec_linux_test.go @@ -610,6 +610,40 @@ func TestDupNamespaces(t *testing.T) { } } +func TestUserNamespaceMappingAndPath(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("Test requires userns.") + } + + spec := &specs.Spec{ + Root: &specs.Root{ + Path: "rootfs", + }, + Linux: &specs.Linux{ + UIDMappings: []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 1000, Size: 1000}, + }, + GIDMappings: []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 2000, Size: 1000}, + }, + Namespaces: []specs.LinuxNamespace{ + { + Type: "user", + Path: "/proc/1/ns/user", + }, + }, + }, + } + + _, err := CreateLibcontainerConfig(&CreateOpts{ + Spec: spec, + }) + + if !strings.Contains(err.Error(), "user namespaces enabled, but both namespace path and mapping specified") { + t.Errorf("user namespace with mapping and namespace path should be forbidden") + } +} + func TestNonZeroEUIDCompatibleSpecconvValidate(t *testing.T) { if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { t.Skip("Test requires userns.") From 2dd8368e5737e47526bf2412e1160a73911046dd Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 23 Nov 2023 12:40:07 +1100 Subject: [PATCH 05/21] integration: add mega-test for joining namespaces (This is a cherry-pick of 6fa8d068438ed47e8318448c34fc4587612a0740.) Given we've had several bugs in this behaviour that have now been fixed, add an integration test that makes sure that you can start a container that joins all of the namespaces of a second container. The only namespace we do not join is the mount namespace, because joining a namespace that has been pivot_root'd leads to a bunch of errors. In principle, removing everything from config.json that requires a mount _should_ work, but the root.path configuration is mandatory and we cannot just ignore setting up the rootfs in the namespace joining scenario (if the user has configured a different rootfs, we need to use it or error out, and there's no reasonable way of checking if if the rootfs paths are the same that doesn't result in spaghetti logic). Signed-off-by: Aleksa Sarai --- tests/integration/run.bats | 57 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tests/integration/run.bats b/tests/integration/run.bats index e59ccb4a43d..88540c763d1 100644 --- a/tests/integration/run.bats +++ b/tests/integration/run.bats @@ -108,3 +108,60 @@ function teardown() { [ "$status" -eq 0 ] [ "$output" = "410" ] } + +@test "runc run [joining existing container namespaces]" { + # Create a detached container with the namespaces we want. We notably want + # to include userns, which requires config-related configuration. + if [ $EUID -eq 0 ]; then + update_config '.linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"containerID": 0, "hostID": 100000, "size": 100}] + | .linux.gidMappings += [{"containerID": 0, "hostID": 200000, "size": 200}]' + mkdir -p rootfs/{proc,sys,tmp} + fi + update_config '.process.args = ["sleep", "infinity"]' + + runc run -d --console-socket "$CONSOLE_SOCKET" target_ctr + [ "$status" -eq 0 ] + + # Modify our container's configuration such that it is just going to + # inherit all of the namespaces of the target container. + # + # NOTE: We cannot join the mount namespace of another container because of + # some quirks of the runtime-spec. In particular, we MUST pivot_root into + # root.path and root.path MUST be set in the config, so runc cannot just + # ignore root.path when joining namespaces (and root.path doesn't exist + # inside root.path, for obvious reasons). + # + # We could hack around this (create a copy of the rootfs inside the rootfs, + # or use a simpler mount namespace target), but those wouldn't be similar + # tests to the other namespace joining tests. + target_pid="$(__runc state target_ctr | jq .pid)" + update_config '.linux.namespaces |= map_values(.path = if .type == "mount" then "" else "/proc/'"$target_pid"'/ns/" + ({"network": "net", "mount": "mnt"}[.type] // .type) end)' + # Remove the userns configuration (it cannot be changed). + update_config '.linux |= (del(.uidMappings) | del(.gidMappings))' + + runc run -d --console-socket "$CONSOLE_SOCKET" attached_ctr + [ "$status" -eq 0 ] + + # Make sure there are two sleep processes in our container. + runc exec attached_ctr ps aux + [ "$status" -eq 0 ] + run -0 grep "sleep infinity" <<<"$output" + [ "${#lines[@]}" -eq 2 ] + + # ... that the userns mappings are the same... + runc exec attached_ctr cat /proc/self/uid_map + [ "$status" -eq 0 ] + if [ $EUID -eq 0 ]; then + grep -E '^\s+0\s+100000\s+100$' <<<"$output" + else + grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" + fi + runc exec attached_ctr cat /proc/self/gid_map + [ "$status" -eq 0 ] + if [ $EUID -eq 0 ]; then + grep -E '^\s+0\s+200000\s+200$' <<<"$output" + else + grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output" + fi +} From e65d4cac663aa4e720cf14ba1bfd3a8900801976 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 6 Dec 2023 22:54:53 +1100 Subject: [PATCH 06/21] specconv: temporarily allow userns path and mapping if they match (This is a cherry-pick of ebcef3e651e61aeee96546301d8db9e92b505ce6.) It turns out that the error added in commit 09822c3da8ad ("configs: disallow ambiguous userns and timens configurations") causes issues with containerd and CRIO because they pass both userns mappings and a userns path. These configurations are broken, but to avoid the regression in this one case, output a warning to tell the user that the configuration is incorrect but we will continue to use it if and only if the configured mappings are identical to the mappings of the provided namespace. Fixes: 09822c3da8ad ("configs: disallow ambiguous userns and timens configurations") Signed-off-by: Aleksa Sarai --- libcontainer/specconv/spec_linux.go | 24 +++++++++++++++++++----- libcontainer/specconv/spec_linux_test.go | 4 ++-- libcontainer/userns/userns_maps_linux.go | 15 +++++++++++++++ 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 0cbf578112d..e0bb528f39a 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -941,11 +941,6 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { } } if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" { - // We cannot allow uid or gid mappings to be set if we are also asked - // to join a userns. - if config.UidMappings != nil || config.GidMappings != nil { - return errors.New("user namespaces enabled, but both namespace path and mapping specified -- you may only provide one") - } // Cache the current userns mappings in our configuration, so that we // can calculate uid and gid mappings within runc. These mappings are // never used for configuring the container if the path is set. @@ -953,6 +948,25 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { if err != nil { return fmt.Errorf("failed to cache mappings for userns: %w", err) } + // We cannot allow uid or gid mappings to be set if we are also asked + // to join a userns. + if config.UidMappings != nil || config.GidMappings != nil { + // FIXME: It turns out that containerd and CRIO pass both a userns + // path and the mappings of the namespace in the same config.json. + // Such a configuration is technically not valid, but we used to + // require mappings be specified, and thus users worked around our + // bug -- so we can't regress it at the moment. But we also don't + // want to produce broken behaviour if the mapping doesn't match + // the userns. So (for now) we output a warning if the actual + // userns mappings match the configuration, otherwise we return an + // error. + if !userns.IsSameMapping(uidMap, config.UidMappings) || + !userns.IsSameMapping(gidMap, config.GidMappings) { + return errors.New("user namespaces enabled, but both namespace path and non-matching mapping specified -- you may only provide one") + } + logrus.Warnf("config.json has both a userns path to join and a matching userns mapping specified -- you may only provide one. Future versions of runc may return an error with this configuration, please report a bug on if you see this warning and cannot update your configuration.") + } + config.UidMappings = uidMap config.GidMappings = gidMap logrus.WithFields(logrus.Fields{ diff --git a/libcontainer/specconv/spec_linux_test.go b/libcontainer/specconv/spec_linux_test.go index 68e57a08308..f79ea0e3727 100644 --- a/libcontainer/specconv/spec_linux_test.go +++ b/libcontainer/specconv/spec_linux_test.go @@ -639,8 +639,8 @@ func TestUserNamespaceMappingAndPath(t *testing.T) { Spec: spec, }) - if !strings.Contains(err.Error(), "user namespaces enabled, but both namespace path and mapping specified") { - t.Errorf("user namespace with mapping and namespace path should be forbidden") + if !strings.Contains(err.Error(), "both namespace path and non-matching mapping specified") { + t.Errorf("user namespace with path and non-matching mapping should be forbidden, got error %v", err) } } diff --git a/libcontainer/userns/userns_maps_linux.go b/libcontainer/userns/userns_maps_linux.go index d81290de2cb..7a8c2b023b3 100644 --- a/libcontainer/userns/userns_maps_linux.go +++ b/libcontainer/userns/userns_maps_linux.go @@ -169,3 +169,18 @@ func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, er return uidMap, gidMap, nil } + +// IsSameMapping returns whether or not the two id mappings are the same. Note +// that if the order of the mappings is different, or a mapping has been split, +// the mappings will be considered different. +func IsSameMapping(a, b []configs.IDMap) bool { + if len(a) != len(b) { + return false + } + for idx := range a { + if a[idx] != b[idx] { + return false + } + } + return true +} From 617db78515a5fbaef9038f43ac0da5aa09adee88 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Fri, 8 Dec 2023 11:07:51 +1100 Subject: [PATCH 07/21] configs: make id mappings int64 to better handle 32-bit (This is a cherry-pick of 482e56379a1c2f8b2573f07c06013510e59f9eb8.) Using ints for all of our mapping structures means that a 32-bit binary errors out when trying to parse /proc/self/*id_map: failed to cache mappings for userns: failed to parse uid_map of userns /proc/1/ns/user: parsing id map failed: invalid format in line " 0 0 4294967295": integer overflow on token 4294967295 This issue was unearthed by commit 1912d5988bbb ("*: actually support joining a userns with a new container") but the underlying issue has been present since the docker/libcontainer days. In theory, switching to uint32 (to match the spec) instead of int64 would also work, but keeping everything signed seems much less error-prone. It's also important to note that a mapping might be too large for an int on 32-bit, so we detect this during the mapping. Signed-off-by: Aleksa Sarai --- libcontainer/configs/config.go | 6 +++--- libcontainer/configs/config_linux.go | 30 ++++++++++++++++++++++------ libcontainer/container_linux.go | 2 +- libcontainer/specconv/spec_linux.go | 6 +++--- 4 files changed, 31 insertions(+), 13 deletions(-) diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go index c1b4a0041c2..6ebf5ec7b60 100644 --- a/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go @@ -21,9 +21,9 @@ type Rlimit struct { // IDMap represents UID/GID Mappings for User Namespaces. type IDMap struct { - ContainerID int `json:"container_id"` - HostID int `json:"host_id"` - Size int `json:"size"` + ContainerID int64 `json:"container_id"` + HostID int64 `json:"host_id"` + Size int64 `json:"size"` } // Seccomp represents syscall restrictions diff --git a/libcontainer/configs/config_linux.go b/libcontainer/configs/config_linux.go index 8c02848b70a..51fe940748a 100644 --- a/libcontainer/configs/config_linux.go +++ b/libcontainer/configs/config_linux.go @@ -1,6 +1,10 @@ package configs -import "errors" +import ( + "errors" + "fmt" + "math" +) var ( errNoUIDMap = errors.New("User namespaces enabled, but no uid mappings found.") @@ -16,11 +20,18 @@ func (c Config) HostUID(containerId int) (int, error) { if c.UidMappings == nil { return -1, errNoUIDMap } - id, found := c.hostIDFromMapping(containerId, c.UidMappings) + id, found := c.hostIDFromMapping(int64(containerId), c.UidMappings) if !found { return -1, errNoUserMap } - return id, nil + // If we are a 32-bit binary running on a 64-bit system, it's possible + // the mapped user is too large to store in an int, which means we + // cannot do the mapping. We can't just return an int64, because + // os.Setuid() takes an int. + if id > math.MaxInt { + return -1, fmt.Errorf("mapping for uid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt) + } + return int(id), nil } // Return unchanged id. return containerId, nil @@ -39,11 +50,18 @@ func (c Config) HostGID(containerId int) (int, error) { if c.GidMappings == nil { return -1, errNoGIDMap } - id, found := c.hostIDFromMapping(containerId, c.GidMappings) + id, found := c.hostIDFromMapping(int64(containerId), c.GidMappings) if !found { return -1, errNoGroupMap } - return id, nil + // If we are a 32-bit binary running on a 64-bit system, it's possible + // the mapped user is too large to store in an int, which means we + // cannot do the mapping. We can't just return an int64, because + // os.Setgid() takes an int. + if id > math.MaxInt { + return -1, fmt.Errorf("mapping for gid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt) + } + return int(id), nil } // Return unchanged id. return containerId, nil @@ -57,7 +75,7 @@ func (c Config) HostRootGID() (int, error) { // Utility function that gets a host ID for a container ID from user namespace map // if that ID is present in the map. -func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) { +func (c Config) hostIDFromMapping(containerID int64, uMap []IDMap) (int64, bool) { for _, m := range uMap { if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) { hostID := m.HostID + (containerID - m.ContainerID) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 2f17e37d43d..59aa0338ac6 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -2268,7 +2268,7 @@ func ignoreTerminateErrors(err error) error { func requiresRootOrMappingTool(c *configs.Config) bool { gidMap := []configs.IDMap{ - {ContainerID: 0, HostID: os.Getegid(), Size: 1}, + {ContainerID: 0, HostID: int64(os.Getegid()), Size: 1}, } return !reflect.DeepEqual(c.GidMappings, gidMap) } diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index e0bb528f39a..7dbfb8691d0 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -927,9 +927,9 @@ next: func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { create := func(m specs.LinuxIDMapping) configs.IDMap { return configs.IDMap{ - HostID: int(m.HostID), - ContainerID: int(m.ContainerID), - Size: int(m.Size), + HostID: int64(m.HostID), + ContainerID: int64(m.ContainerID), + Size: int64(m.Size), } } if spec.Linux != nil { From 4bccb38cc9cf198d52bebf2b3a90cd14e7af8c06 Mon Sep 17 00:00:00 2001 From: lfbzhm Date: Mon, 1 Jan 2024 15:21:45 +0000 Subject: [PATCH 08/21] VERSION: release 1.1.11 Signed-off-by: lfbzhm Signed-off-by: Aleksa Sarai --- CHANGELOG.md | 24 +++++++++++++++++++----- VERSION | 2 +- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 13d2ea0b4bb..7452f70faf0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased 1.1.z] +## [1.1.11] - 2024-01-01 + +> Happy New Year! + +### Fixed + +* Fix several issues with userns path handling. (#4122, #4124, #4134, #4144) + +### Changed + + * Support memory.peak and memory.swap.peak in cgroups v2. + Add `swapOnlyUsage` in `MemoryStats`. This field reports swap-only usage. + For cgroupv1, `Usage` and `Failcnt` are set by subtracting memory usage + from memory+swap usage. For cgroupv2, `Usage`, `Limit`, and `MaxUsage` + are set. (#4000, #4010, #4131) + * build(deps): bump github.com/cyphar/filepath-securejoin. (#4140) + ## [1.1.10] - 2023-10-31 > Śruba, przykręcona we śnie, nie zmieni sytuacji, jaka panuje na jawie. @@ -46,10 +63,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 This aligns cgroupv2 root usage more closely with cgroupv1 reporting. Additionally, report root swap usage as sum of swap and memory usage, aligned with v1 and existing non-root v2 reporting. (#3933) - * Add `swapOnlyUsage` in `MemoryStats`. This field reports swap-only usage. - For cgroupv1, `Usage` and `Failcnt` are set by subtracting memory usage - from memory+swap usage. For cgroupv2, `Usage`, `Limit`, and `MaxUsage` - are set. (#4010) ## [1.1.8] - 2023-07-20 @@ -480,7 +493,8 @@ implementation (libcontainer) is *not* covered by this policy. [1.0.1]: https://github.com/opencontainers/runc/compare/v1.0.0...v1.0.1 -[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.10...release-1.1 +[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.11...release-1.1 +[1.1.11]: https://github.com/opencontainers/runc/compare/v1.1.10...v1.1.11 [1.1.10]: https://github.com/opencontainers/runc/compare/v1.1.9...v1.1.10 [1.1.9]: https://github.com/opencontainers/runc/compare/v1.1.8...v1.1.9 [1.1.8]: https://github.com/opencontainers/runc/compare/v1.1.7...v1.1.8 diff --git a/VERSION b/VERSION index f5e7ca832ca..9ee1f786d50 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.1.10+dev +1.1.11 From 7887736f332d97a6d8729b23e8ac7faee96cced9 Mon Sep 17 00:00:00 2001 From: lfbzhm Date: Mon, 1 Jan 2024 15:23:00 +0000 Subject: [PATCH 09/21] VERSION: back to development Signed-off-by: lfbzhm Signed-off-by: Aleksa Sarai --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 9ee1f786d50..5aafe9bab6b 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.1.11 +1.1.11+dev From d561e5da874134985dd4b9a7af064dc9626d83f1 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 22 Jan 2024 15:52:14 +1100 Subject: [PATCH 10/21] keyring: update cyphar@cyphar.com key expiry Signed-off-by: Aleksa Sarai --- runc.keyring | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/runc.keyring b/runc.keyring index 84b4ca38a1e..ab0b5a151d5 100644 --- a/runc.keyring +++ b/runc.keyring @@ -72,18 +72,18 @@ KWr9ByCKAwVHsaSgVSJE/dse4f1toqeEHHbWk682U4RqOWZR4bA0 pub ed25519 2019-06-21 [C] C9C370B246B09F6DBCFC744C34401015D1D2D386 uid [ultimate] Aleksa Sarai -sub ed25519 2022-09-30 [S] [expires: 2024-09-29] -sub cv25519 2022-09-30 [E] [expires: 2024-09-29] -sub ed25519 2022-09-30 [A] [expires: 2024-09-29] +sub ed25519 2022-09-30 [S] [expires: 2030-03-25] +sub cv25519 2022-09-30 [E] [expires: 2030-03-25] +sub ed25519 2022-09-30 [A] [expires: 2030-03-25] -----BEGIN PGP PUBLIC KEY BLOCK----- Comment: github=cyphar mDMEXQxvLxYJKwYBBAHaRw8BAQdArRQoZs9YzYtQIiPA1qdvUT8Q0wbPZyRV65Tz QNTIZla0IEFsZWtzYSBTYXJhaSA8Y3lwaGFyQGN5cGhhci5jb20+iJAEExYIADgF -CwkIBwIGFQoJCAsCBBYCAwECHgECF4AWIQTJw3CyRrCfbbz8dEw0QBAV0dLThgUC -XQzCHwIbAQAKCRA0QBAV0dLThvUpAP9SwyOijLqEBz1A9pTqRAB0l/r+ABq+iUmH -UjMHO34LZAD/biRuAadaxIYJtmn7nKA55doyN2fQXhjArqypJ1SQywi4MwRdDMJS +CwkIBwIGFQoJCAsCBBYCAwECHgECF4ACGwEWIQTJw3CyRrCfbbz8dEw0QBAV0dLT +hgUCZa3xwQAKCRA0QBAV0dLThpQyAQDGzjZyyWWmd6Ykg5/lymp2MLIg1f2jG6ew +AiPT4ATkBAD/RgdLDf1IQStEH7pHmQa1qvqyRq1jeEgF23KruXbbdQ64MwRdDMJS FgkrBgEEAdpHDwEBB0B2IGusH7LuDH3hNT6JYM30S7G92FGogA6a9WQzKRlqvIh4 BCgWCgAgFiEEycNwskawn228/HRMNEAQFdHS04YFAmM2ukUCHQEACgkQNEAQFdHS 04ZTQAEAjAT0fXVJHdRL6UMCxDYsgjG+QyH1mr7gKgbPvB8A5LgBAN4QDqCxIY3b @@ -106,20 +106,20 @@ o7lcWozXFlQDOM7eoT4avvWOVcsaD4h4BBgWCAAgFiEEycNwskawn228/HRMNEAQ FdHS04YFAl0Mwo0CGyAACgkQNEAQFdHS04ajxQEAsZf1yDORUVYicREc/7z0U+51 DJzeAexeJTYM+N+x13EA/0Ex+o7qQ7dZLGDn7x4LSbd39C+++suHsEaE4XwlX6cH uDMEYza6SxYJKwYBBAHaRw8BAQdAE3s7dZQFuImQX2tWshIdGjeUKZc7rlMcrZ6+ -q25gaH2I9QQYFgoAJhYhBMnDcLJGsJ9tvPx0TDRAEBXR0tOGBQJjNrpLAhsCBQkD -wmcAAIEJEDRAEBXR0tOGdiAEGRYKAB0WIQS2TklVsp+j1GPyqQYol/rSt+lEbwUC -Yza6SwAKCRAol/rSt+lEb9obAQC8ij4yJTU7ZcAtTx2ZMjj8EoruGb3ku6VpRyx1 -+pyQQgD/QgQ7X1G7xtwuVpY0kHYga1yoKLA2ycT8F8PrVtF7pAMWkgD9EWe1E77C -BVd//i3ib+h9ikCeJ+gaxc6aU24ZBcN2tfUBAJmCmYQ0VEbXyvCqkdJEQ4qk5Y9C -2V4w83dj4a5RYKUGuDgEYza6YBIKKwYBBAGXVQEFAQEHQKECW5Y7nUGCka0/WcCM -OerRY95Pm2DQVL76QzvhXD8tAwEIB4h+BBgWCgAmFiEEycNwskawn228/HRMNEAQ -FdHS04YFAmM2umACGwwFCQPCZwAACgkQNEAQFdHS04bkuwEA7AEL+iSPlA8/YILp -0sFMzmtRqTDMqx2BY8K5wEk9fusA/jAhbeJw57bZYvK4MghfUa9tRocyII84UmOA -cgDbPPIFuDMEYza6bhYJKwYBBAHaRw8BAQdAgHXd0yf6MPXJZCZ3TFz8xLymyPsD -TF2SQwwqM4+nYbeIfgQYFgoAJhYhBMnDcLJGsJ9tvPx0TDRAEBXR0tOGBQJjNrpu -AhsgBQkDwmcAAAoJEDRAEBXR0tOGB8UA/0wf8uECKMmXGQ4DNi+ei2E9Ft6GL8qw -UGjwM/EKH2RoAP9HNRRKBjDxs/AZ3pBx1Q8hnHELLo0kXPc+3BG6Pht5BA== -=KN4V +q25gaH2I9QQYFgoAJgIbAhYhBMnDcLJGsJ9tvPx0TDRAEBXR0tOGBQJlrfJcBQkO +EpjFAIF2IAQZFgoAHRYhBLZOSVWyn6PUY/KpBiiX+tK36URvBQJjNrpLAAoJECiX ++tK36URv2hsBALyKPjIlNTtlwC1PHZkyOPwSiu4ZveS7pWlHLHX6nJBCAP9CBDtf +UbvG3C5WljSQdiBrXKgosDbJxPwXw+tW0XukAwkQNEAQFdHS04bMkQEA9elVwA0A ++ywDw+jnifIc98XqLI+KF3Xl0A9+lMuwthMBAO00DeAEjkryFMGp62GPNHqr/r6p ++6DIeUjWgK4Sh8IMuDgEYza6YBIKKwYBBAGXVQEFAQEHQKECW5Y7nUGCka0/WcCM +OerRY95Pm2DQVL76QzvhXD8tAwEIB4h+BBgWCgAmAhsMFiEEycNwskawn228/HRM +NEAQFdHS04YFAmWt8lwFCQ4SmLAACgkQNEAQFdHS04apHgD+MIRj2kujpxtQt04D +ZB+hofBtHIEMo2tplFBYvhZ6KOMA/1q3aRv6jnWAv8woc50KitP4/+iPmfyzaBA/ +8XA5DdIKuDMEYza6bhYJKwYBBAHaRw8BAQdAgHXd0yf6MPXJZCZ3TFz8xLymyPsD +TF2SQwwqM4+nYbeIfgQYFgoAJgIbIBYhBMnDcLJGsJ9tvPx0TDRAEBXR0tOGBQJl +rfJcBQkOEpiiAAoJEDRAEBXR0tOGAUwA/jbaz04OXnV3PYC/yQUsUJsihCTqz4Ne +lxxclgJYU604APsFzpoLD0oUlfMn5Fh75ftkKPrwiHpTj4rRU6oIQu1/Bg== +=Ab7w -----END PGP PUBLIC KEY BLOCK----- pub rsa2048 2020-04-28 [SC] [expires: 2025-04-18] From d0b1a374401fb97af53c12a4efab03113375aeea Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 22 Jan 2024 16:14:17 +1100 Subject: [PATCH 11/21] keyring: update AkihiroSuda key expiry Signed-off-by: Aleksa Sarai --- runc.keyring | 64 ++++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/runc.keyring b/runc.keyring index ab0b5a151d5..3fb6d283636 100644 --- a/runc.keyring +++ b/runc.keyring @@ -159,11 +159,11 @@ S1NPMeS7+G/gPN9Ze9qFmOF2p57cmEa+8mriZCYY3BcUBOiMOV5HSBKJwqA2M8au =GkpD -----END PGP PUBLIC KEY BLOCK----- -pub rsa3072 2019-07-25 [SC] [expires: 2023-11-02] +pub rsa3072 2019-07-25 [SC] [expires: 2025-07-27] C020EA876CE4E06C7AB95AEF49524C6F9F638F1A uid [ultimate] Akihiro Suda uid [ultimate] Akihiro Suda -sub rsa3072 2019-07-25 [E] [expires: 2023-11-02] +sub rsa3072 2019-07-25 [E] [expires: 2025-07-27] -----BEGIN PGP PUBLIC KEY BLOCK----- Comment: github=AkihiroSuda @@ -178,26 +178,26 @@ dsyq4W565jNRV/HWRUMR+LDIS1KiEalzDoID3aUXRHHLUQG0oqX8jqFJUqp1P9pO 9nezuUDg8SsaBg8O4tyv/CZq/FeF3RMMc2EHTiO8HTERqmRMxUFZv3bkgA4GnjnA 3wsZhLXQq+UaIJUAEQEAAbQsQWtpaGlybyBTdWRhIDxha2loaXJvLnN1ZGEuY3pA aGNvLm50dC5jby5qcD6JAdQEEwEKAD4CGwMFCwkIBwIGFQoJCAsCBBYCAwECHgEC -F4AWIQTAIOqHbOTgbHq5Wu9JUkxvn2OPGgUCYYDT5gUJCAkhxwAKCRBJUkxvn2OP -GiHnC/wOqAvEcRmpKjqx4QUNkE34oGwiPgV5vyDlQElvBzyazQEcIdt9xaIE+4IS -7L7L6Q7WOGxWCvmRZ58E32m4RB1F8L7XQW0l3f6jESYLGPb6XDloux5poJzGxaGK -9gd6ItNmjOCmt08Icv0ZVTvKv20ej71aepllE5UaM9p5AlEwLkzQxPoGpB7E1Sdy -citRg6YEqTY+i5IeZ5xMthWXcushyLRRvm43DwbPsuZHVC1yMfo5VrF9JE65BdE9 -dIsCrZDnde/jUm4pAAwyAKSLLRVgj4xVP0XIdO2nVXDBWp9z4gUt/gMjuutO1a2U -Xw+XhkirUb2C++L0KvVBMbU303Q+xV/iaYjAuFjNy94HZms0iTBTB4qFHT4ClYHi -mNwTgfwRclpywkHzDi8496hsyzoVCeHSsu+ScDE1qAw6zrxASZXevYhhB2aBLr1s -d58WsYA37iXTEO4Hxm5V0Wh110hlCGFwcN8vWNhMCdIj7JN8nWZQNLZyppN7bCDu -FX8cE260I0FraWhpcm8gU3VkYSA8c3VkYS5reW90b0BnbWFpbC5jb20+iQHUBBMB +F4AWIQTAIOqHbOTgbHq5Wu9JUkxvn2OPGgUCZMPL2QUJC0wZugAKCRBJUkxvn2OP +GqTiC/93jTl0ci2zWC8vVBPSyjHDrpOhn+3ukCeC7VxHOdo6hBwbsxqaBUWi0Maf +p9oa4HzmsQjhMM+i3/Q/jHBvijXQ2UO5MaDrLhacoAW8i/YeU2aKn2yIyrQPIdc/ +tlcwjvsRPt534DOisf1N5+w6Y4DRgt2tNl0KOjEBmXsBWN7Fg+QRfLeNWKS9soq7 +QkI68T0e0h752FmI8TK4yy6FrhLVUU2ArLcOV2wjx5zKnWjgX7BbwYjAp8fi9hcC +XdmSvllQ8U9Y2ll8dDq3HBmo+uI4lfz31S4B5EKo4Wn+3bA4Y+VBNoJfoKyLeOgr +0cmo6SRJIsVaSvAJcMZ6oq+jvTDuygfRkxxgoTzCgwre7CPzcvC8gC0sYOB34TN4 +UogwN3pFmCPfi5TjXsx7vgfWKlHgwe3L/5aoQjTm+z6WanTHbIqOK9QkIuGykMpL +7nOJeH9LoRzpzc8aOwIOki2bbo7s9yzL8Gil+zaqe16Q+Y7wVBxSRxbg/3oUTi1K +/uM8N4S0I0FraWhpcm8gU3VkYSA8c3VkYS5reW90b0BnbWFpbC5jb20+iQHUBBMB CgA+AhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4BAheAFiEEwCDqh2zk4Gx6uVrvSVJM -b59jjxoFAmGA0+YFCQgJIccACgkQSVJMb59jjxoMJwwAgZxXa8DPoUWeazt5TIVX -omVcsor2J75CqPKlOjvSVXSnCzkBM1kYN2RwVjNivuIEUWPDOohvUvJxllkm7dxd -g+XfLL3/luB4B+R06n78339K0pu4+n5eDIF0UiNbfuGocqFtVBXuC0uj7ZWPJnZe -tdbspisggJ8Q2Im7mQPQRQZ1Q1qBlogxpeeDzyGkrLRusryfd8LwPz7/8I59pkwG -hkNm0+JbaDJ1NtFElX+XvPaOxfCB3ut94CUjac0DdkQNDX+i2ruZNAsIjEuxQbuT -UAc1ouv+R126SBqVdkRLtRw+d0DmAR7PiL37C8KjQa6s+H46jzhLDQ0a3frZdo2w -c1Sony8C60w9q8wpGjJjjelTimsEW8aa7e17xMVgZrawAOAPDuGvbRMGl6fla9T2 -ZYTF6QDzoeqB4VgL441yJm0c2/c6L8gz8ehCNGyqxtfFX+8OO4W3+p4a/mKP8MLz -9l04g71QkuAi3bF7bbrsWmagMXJJJWTHbizDLaytI/6nuQGNBF06GR8BDACxpQ9c +b59jjxoFAmTDy9kFCQtMGboACgkQSVJMb59jjxogzgv/a+4+T5Xoklt0rGujSgtD +ogpQp4guaImEhkPieWMPG7+UfqxwoMLcvLE5kTzqLPe1DdYs8Tm/gtteHttLUfjD +qwY/+BsqIYYMJMRoXFBk2iokn0m/36da7WKpN+5r5ssujsvGj991k4oLQgFV0kEx +f4PSRxWQNlAqp4OfQNI91S7oMDH94dR+V5TIYYHxsPsnCvygD72GVER4G5mUvkCH +Nf8aqeckVxu8uZ/2LiNtYxbh5pwriuj8XbifuawdMdjpTvwAAa2DuKqCtj9cuQIt +hmOF1ux68TRxk//QGPqX49+WT0mwdHBX/I/nZVTOGt9sjjKU5m1o+rUiVHtQ3Yhw +fSLWEbfZiTjWDPWpjLU+r3C2qCiJyPjNpsxYAp4y3v511BXesejcXm24+MHFym5F +ngyAItzwDD9ieTt3uviuC64VZVz7NgnDMUK0LumKh9mrZZ20dTcX9Vw70o41CMQN +yBKloXOSPzQDZp1ZXzR3P/22WXG/e52YuU3Aw1femld+uQGNBF06GR8BDACxpQ9c y72+/WZGon+CToNj+a24PiduyExfFv26E0D77ACS6UAC5jz71mSuLbHiauQ3MHj+ 786z4m4St8+HjDL9YrAe19MobxWsLHAFvBJ8UHfZdkLzBkIKPHz7TUqlhvFR13b6 ZAZVZk975hgCT3LpzA1miHBY2E5WDpVa3pe94xshVHL3iVf9Jv1a4hmM+eu0gxX4 @@ -206,16 +206,16 @@ Qf6sg0b/k6/vkVveopeeH28zb/nnVuhgGSxcbiZUrFC9EfhX4/6NNFRhE300AjeF bP7SoXx3qRhr993BDSP32r44hy+kYLhZP5K5oXivcITJZuGcJh49P4QuYGrnODIL gEhedWeePcJXFcEz09teizlWKGzd+EA3uwYd/bQelflwXkGuCLaoNv4qcH3oJDp1 vYI0zT7hGvnz3thRLg3SOWFq5cBhnfNGXPLsoNZBzWGn2cm5MJYSKjIM470AEQEA -AYkBvAQYAQoAJgIbDBYhBMAg6ods5OBserla70lSTG+fY48aBQJhgNRTBQkICSI0 -AAoJEElSTG+fY48a3YML/3snhGBx/Xd0EcK0pzyvyivZwavlGsQPAF2c1Rj7Lr1i -eUrp6CZ/yW7/oAvlk6Ngc0SoWba/pgnz7bVQEc21JTY86M1bRLLh3fmYCx8YFbsR -43zVr2bxDledzKV3bIuWStWbljHECuNTT91907pc3r4jv+jN4ZaXVUQ9pXj0DrV+ -MTJVCo7nrEXiq6q1WqaUAV9dMQE3rWGFa2u45QCZGLckOu3cuSCU8CVxSScmxgII -bUBu17xDzQnDkdcEQzzkZtDOrwF76dPdlrW69PXtC9oElRJbGCERivqlrpKDagXI -h4eZYfcFb2gc0qZjblvfVHiot65WM9bUsSAUAEfskYqIGLshzV9MrxFYQYvgt3ym -Qs7D8ORJiphjaOvDeqVyGdPm/rN5SVMVGYpJX6EkZkHinV/kRChtuLAD7NQ3YH5O -5l+Ehze9Nm4laEXQC/tme9B1XH0PUBJk1x8NeoVrYCTnypVFfRw37mC9XBu5TF6U -ix7vx45U/EvZrqmkDrEFOQ== -=4+1P +AYkBvAQYAQoAJgIbDBYhBMAg6ods5OBserla70lSTG+fY48aBQJkw8uyBQkLTBmT +AAoJEElSTG+fY48ayhsL+gLvKlfkYgxodyWKR5hOiUMKWE5tqfQY6kqrgssPYw+u +Fn69AamQLt4I2AHRg0AHjoZEsMfR19uXZ24XwwcWwgWU6yRJgMSIK67bLvL+d686 +m2KQ2PpmfDrizUgY4J0sY+tzwNZeWxQiFy/Ni6AdEqJvJQDsrKYJ2GGWm6JMZCPw +y3h5ouueieiEc0pvwEz2kg64uv6p8SUV1me66IXQaGseXb/BcW+Ap2WJO+IZjtNB +qhk+V+1x5ZT6s9RecjiTDmKfZ71zyRWplkfL22+4XVEc3qLS3r0ZSzeIA4JPRf+N +yCGjavdTNgu2bTo8iSgBq2NRT9kNwTaS8j883L0eY/JJktrfWnWE4qAuXBqLzkIl +smspRWy0byLQrrzk9stncF/CDt5XuHPcsXOcRVXVyM+/RXqWKdNAwZO67HD4wJR9 +YR4avhGZZXguH3b0ka2zO8sxTju/09yb07NJ2qfjfWSHCmaj9KuhhE0EO625tckS +58ceqolNBtrydoYZOc2CKw== +=ol6W -----END PGP PUBLIC KEY BLOCK----- From 506552a88bd3455e80a9b3829568e94ec0160309 Mon Sep 17 00:00:00 2001 From: "hang.jiang" Date: Fri, 1 Sep 2023 16:17:13 +0800 Subject: [PATCH 12/21] Fix File to Close (This is a cherry-pick of 937ca107c3d22da77eb8e8030f2342253b980980.) Signed-off-by: hang.jiang Fixes: GHSA-xr7r-f8xq-vfvv CVE-2024-21626 Signed-off-by: Aleksa Sarai --- libcontainer/cgroups/fs/paths.go | 1 + update.go | 1 + 2 files changed, 2 insertions(+) diff --git a/libcontainer/cgroups/fs/paths.go b/libcontainer/cgroups/fs/paths.go index 1092331b25d..2cb970a3d55 100644 --- a/libcontainer/cgroups/fs/paths.go +++ b/libcontainer/cgroups/fs/paths.go @@ -83,6 +83,7 @@ func tryDefaultCgroupRoot() string { if err != nil { return "" } + defer dir.Close() names, err := dir.Readdirnames(1) if err != nil { return "" diff --git a/update.go b/update.go index 9ce5a2e835b..6d582ddddec 100644 --- a/update.go +++ b/update.go @@ -174,6 +174,7 @@ other options are ignored. if err != nil { return err } + defer f.Close() } err = json.NewDecoder(f).Decode(&r) if err != nil { From 0994249a5ec4e363bfcf9af58a87a722e9a3a31b Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 26 Dec 2023 23:53:07 +1100 Subject: [PATCH 13/21] init: verify after chdir that cwd is inside the container If a file descriptor of a directory in the host's mount namespace is leaked to runc init, a malicious config.json could use /proc/self/fd/... as a working directory to allow for host filesystem access after the container runs. This can also be exploited by a container process if it knows that an administrator will use "runc exec --cwd" and the target --cwd (the attacker can change that cwd to be a symlink pointing to /proc/self/fd/... and wait for the process to exec and then snoop on /proc/$pid/cwd to get access to the host). The former issue can lead to a critical vulnerability in Docker and Kubernetes, while the latter is a container breakout. We can (ab)use the fact that getcwd(2) on Linux detects this exact case, and getcwd(3) and Go's Getwd() return an error as a result. Thus, if we just do os.Getwd() after chdir we can easily detect this case and error out. In runc 1.1, a /sys/fs/cgroup handle happens to be leaked to "runc init", making this exploitable. On runc main it just so happens that the leaked /sys/fs/cgroup gets clobbered and thus this is only consistently exploitable for runc 1.1. Fixes: GHSA-xr7r-f8xq-vfvv CVE-2024-21626 Co-developed-by: lifubang Signed-off-by: lifubang [refactored the implementation and added more comments] Signed-off-by: Aleksa Sarai --- libcontainer/init_linux.go | 31 ++++++++++++++++++++++++ libcontainer/integration/seccomp_test.go | 20 +++++++-------- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 5b88c71fc83..d9f18139f54 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -8,6 +8,7 @@ import ( "io" "net" "os" + "path/filepath" "strings" "unsafe" @@ -135,6 +136,32 @@ func populateProcessEnvironment(env []string) error { return nil } +// verifyCwd ensures that the current directory is actually inside the mount +// namespace root of the current process. +func verifyCwd() error { + // getcwd(2) on Linux detects if cwd is outside of the rootfs of the + // current mount namespace root, and in that case prefixes "(unreachable)" + // to the returned string. glibc's getcwd(3) and Go's Getwd() both detect + // when this happens and return ENOENT rather than returning a non-absolute + // path. In both cases we can therefore easily detect if we have an invalid + // cwd by checking the return value of getcwd(3). See getcwd(3) for more + // details, and CVE-2024-21626 for the security issue that motivated this + // check. + // + // We have to use unix.Getwd() here because os.Getwd() has a workaround for + // $PWD which involves doing stat(.), which can fail if the current + // directory is inaccessible to the container process. + if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) { + return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected") + } else if err != nil { + return fmt.Errorf("failed to verify if current working directory is safe: %w", err) + } else if !filepath.IsAbs(wd) { + // We shouldn't ever hit this, but check just in case. + return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd) + } + return nil +} + // finalizeNamespace drops the caps, sets the correct user // and working dir, and closes any leaked file descriptors // before executing the command inside the namespace @@ -193,6 +220,10 @@ func finalizeNamespace(config *initConfig) error { return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err) } } + // Make sure our final working directory is inside the container. + if err := verifyCwd(); err != nil { + return err + } if err := system.ClearKeepCaps(); err != nil { return fmt.Errorf("unable to clear keep caps: %w", err) } diff --git a/libcontainer/integration/seccomp_test.go b/libcontainer/integration/seccomp_test.go index 31092a0a5d2..ecdfa7957df 100644 --- a/libcontainer/integration/seccomp_test.go +++ b/libcontainer/integration/seccomp_test.go @@ -13,7 +13,7 @@ import ( libseccomp "github.com/seccomp/libseccomp-golang" ) -func TestSeccompDenyGetcwdWithErrno(t *testing.T) { +func TestSeccompDenySyslogWithErrno(t *testing.T) { if testing.Short() { return } @@ -25,7 +25,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) { DefaultAction: configs.Allow, Syscalls: []*configs.Syscall{ { - Name: "getcwd", + Name: "syslog", Action: configs.Errno, ErrnoRet: &errnoRet, }, @@ -39,7 +39,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) { buffers := newStdBuffers() pwd := &libcontainer.Process{ Cwd: "/", - Args: []string{"pwd"}, + Args: []string{"dmesg"}, Env: standardEnvironment, Stdin: buffers.Stdin, Stdout: buffers.Stdout, @@ -65,17 +65,17 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) { } if exitCode == 0 { - t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode) + t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode) } - expected := "pwd: getcwd: No such process" + expected := "dmesg: klogctl: No such process" actual := strings.Trim(buffers.Stderr.String(), "\n") if actual != expected { t.Fatalf("Expected output %s but got %s\n", expected, actual) } } -func TestSeccompDenyGetcwd(t *testing.T) { +func TestSeccompDenySyslog(t *testing.T) { if testing.Short() { return } @@ -85,7 +85,7 @@ func TestSeccompDenyGetcwd(t *testing.T) { DefaultAction: configs.Allow, Syscalls: []*configs.Syscall{ { - Name: "getcwd", + Name: "syslog", Action: configs.Errno, }, }, @@ -98,7 +98,7 @@ func TestSeccompDenyGetcwd(t *testing.T) { buffers := newStdBuffers() pwd := &libcontainer.Process{ Cwd: "/", - Args: []string{"pwd"}, + Args: []string{"dmesg"}, Env: standardEnvironment, Stdin: buffers.Stdin, Stdout: buffers.Stdout, @@ -124,10 +124,10 @@ func TestSeccompDenyGetcwd(t *testing.T) { } if exitCode == 0 { - t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode) + t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode) } - expected := "pwd: getcwd: Operation not permitted" + expected := "dmesg: klogctl: Operation not permitted" actual := strings.Trim(buffers.Stderr.String(), "\n") if actual != expected { t.Fatalf("Expected output %s but got %s\n", expected, actual) From fbe3eed1e568a376f371d2ced1b4ac16b7d7adde Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Fri, 5 Jan 2024 01:42:32 +1100 Subject: [PATCH 14/21] setns init: do explicit lookup of execve argument early (This is a partial backport of a minor change included in commit dac41717465462b21fab5b5942fe4cb3f47d7e53.) This mirrors the logic in standard_init_linux.go, and also ensures that we do not call exec.LookPath in the final execve step. While this is okay for regular binaries, it seems exec.LookPath calls os.Getenv which tries to emit a log entry to the test harness when running in "go test" mode. In a future patch (in order to fix CVE-2024-21626), we will close all of the file descriptors immediately before execve, which would mean the file descriptor for test harness logging would be closed at execve time. So, moving exec.LookPath earlier is necessary. Ref: dac417174654 ("runc-dmz: reduce memfd binary cloning cost with small C binary") Signed-off-by: Aleksa Sarai --- libcontainer/setns_init_linux.go | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index 09ab552b3d1..e891773ec57 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -4,6 +4,7 @@ import ( "errors" "fmt" "os" + "os/exec" "strconv" "github.com/opencontainers/selinux/go-selinux" @@ -82,6 +83,21 @@ func (l *linuxSetnsInit) Init() error { if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { return err } + + // Check for the arg before waiting to make sure it exists and it is + // returned as a create time error. + name, err := exec.LookPath(l.config.Args[0]) + if err != nil { + return err + } + // exec.LookPath in Go < 1.20 might return no error for an executable + // residing on a file system mounted with noexec flag, so perform this + // extra check now while we can still return a proper error. + // TODO: remove this once go < 1.20 is not supported. + if err := eaccess(name); err != nil { + return &os.PathError{Op: "eaccess", Path: name, Err: err} + } + // Set seccomp as close to execve as possible, so as few syscalls take // place afterward (reducing the amount of syscalls that users need to // enable in their seccomp profiles). @@ -101,5 +117,5 @@ func (l *linuxSetnsInit) Init() error { return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err} } - return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) + return system.Exec(name, l.config.Args[0:], os.Environ()) } From 284ba3057e428f8d6c7afcc3b0ac752e525957df Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 2 Jan 2024 14:58:28 +1100 Subject: [PATCH 15/21] init: close internal fds before execve If we leak a file descriptor referencing the host filesystem, an attacker could use a /proc/self/fd magic-link as the source for execve to execute a host binary in the container. This would allow the binary itself (or a process inside the container in the 'runc exec' case) to write to a host binary, leading to a container escape. The simple solution is to make sure we close all file descriptors immediately before the execve(2) step. Doing this earlier can lead to very serious issues in Go (as file descriptors can be reused, any (*os.File) reference could start silently operating on a different file) so we have to do it as late as possible. Unfortunately, there are some Go runtime file descriptors that we must not close (otherwise the Go scheduler panics randomly). The only way of being sure which file descriptors cannot be closed is to sneakily go:linkname the runtime internal "internal/poll.IsPollDescriptor" function. This is almost certainly not recommended but there isn't any other way to be absolutely sure, while also closing any other possible files. In addition, we can keep the logrus forwarding logfd open because you cannot execve a pipe and the contents of the pipe are so restricted (JSON-encoded in a format we pick) that it seems unlikely you could even construct shellcode. Closing the logfd causes issues if there is an error returned from execve. In mainline runc, runc-dmz protects us against this attack because the intermediate execve(2) closes all of the O_CLOEXEC internal runc file descriptors and thus runc-dmz cannot access them to attack the host. Fixes: GHSA-xr7r-f8xq-vfvv CVE-2024-21626 Signed-off-by: Aleksa Sarai --- libcontainer/logs/logs.go | 9 ++++ libcontainer/setns_init_linux.go | 19 ++++++++ libcontainer/standard_init_linux.go | 19 ++++++++ libcontainer/utils/utils_unix.go | 72 +++++++++++++++++++++++++---- 4 files changed, 111 insertions(+), 8 deletions(-) diff --git a/libcontainer/logs/logs.go b/libcontainer/logs/logs.go index 95deb0d6ca7..349a18ed383 100644 --- a/libcontainer/logs/logs.go +++ b/libcontainer/logs/logs.go @@ -4,10 +4,19 @@ import ( "bufio" "encoding/json" "io" + "os" "github.com/sirupsen/logrus" ) +// IsLogrusFd returns whether the provided fd matches the one that logrus is +// currently outputting to. This should only ever be called by UnsafeCloseFrom +// from `runc init`. +func IsLogrusFd(fd uintptr) bool { + file, ok := logrus.StandardLogger().Out.(*os.File) + return ok && file.Fd() == fd +} + func ForwardLogs(logPipe io.ReadCloser) chan error { done := make(chan error, 1) s := bufio.NewScanner(logPipe) diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index e891773ec57..d1bb12273c0 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -15,6 +15,7 @@ import ( "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" ) // linuxSetnsInit performs the container's initialization for running a new process @@ -117,5 +118,23 @@ func (l *linuxSetnsInit) Init() error { return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err} } + // Close all file descriptors we are not passing to the container. This is + // necessary because the execve target could use internal runc fds as the + // execve path, potentially giving access to binary files from the host + // (which can then be opened by container processes, leading to container + // escapes). Note that because this operation will close any open file + // descriptors that are referenced by (*os.File) handles from underneath + // the Go runtime, we must not do any file operations after this point + // (otherwise the (*os.File) finaliser could close the wrong file). See + // CVE-2024-21626 for more information as to why this protection is + // necessary. + // + // This is not needed for runc-dmz, because the extra execve(2) step means + // that all O_CLOEXEC file descriptors have already been closed and thus + // the second execve(2) from runc-dmz cannot access internal file + // descriptors from runc. + if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { + return err + } return system.Exec(name, l.config.Args[0:], os.Environ()) } diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index c09a7bed30e..d1d94352f93 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -17,6 +17,7 @@ import ( "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" ) type linuxStandardInit struct { @@ -258,5 +259,23 @@ func (l *linuxStandardInit) Init() error { return err } + // Close all file descriptors we are not passing to the container. This is + // necessary because the execve target could use internal runc fds as the + // execve path, potentially giving access to binary files from the host + // (which can then be opened by container processes, leading to container + // escapes). Note that because this operation will close any open file + // descriptors that are referenced by (*os.File) handles from underneath + // the Go runtime, we must not do any file operations after this point + // (otherwise the (*os.File) finaliser could close the wrong file). See + // CVE-2024-21626 for more information as to why this protection is + // necessary. + // + // This is not needed for runc-dmz, because the extra execve(2) step means + // that all O_CLOEXEC file descriptors have already been closed and thus + // the second execve(2) from runc-dmz cannot access internal file + // descriptors from runc. + if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { + return err + } return system.Exec(name, l.config.Args[0:], os.Environ()) } diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index 220d0b43937..842f9b0a6d2 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -7,8 +7,11 @@ import ( "fmt" "os" "strconv" + _ "unsafe" // for go:linkname "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/logs" ) // EnsureProcHandle returns whether or not the given file handle is on procfs. @@ -23,9 +26,11 @@ func EnsureProcHandle(fh *os.File) error { return nil } -// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for -// the process (except for those below the given fd value). -func CloseExecFrom(minFd int) error { +type fdFunc func(fd int) + +// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in +// the current process. +func fdRangeFrom(minFd int, fn fdFunc) error { fdDir, err := os.Open("/proc/self/fd") if err != nil { return err @@ -50,15 +55,66 @@ func CloseExecFrom(minFd int) error { if fd < minFd { continue } - // Intentionally ignore errors from unix.CloseOnExec -- the cases where - // this might fail are basically file descriptors that have already - // been closed (including and especially the one that was created when - // os.ReadDir did the "opendir" syscall). - unix.CloseOnExec(fd) + // Ignore the file descriptor we used for readdir, as it will be closed + // when we return. + if uintptr(fd) == fdDir.Fd() { + continue + } + // Run the closure. + fn(fd) } return nil } +// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or +// equal to minFd in the current process. +func CloseExecFrom(minFd int) error { + return fdRangeFrom(minFd, unix.CloseOnExec) +} + +//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor + +// In order to make sure we do not close the internal epoll descriptors the Go +// runtime uses, we need to ensure that we skip descriptors that match +// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing, +// unfortunately there's no other way to be sure we're only keeping the file +// descriptors the Go runtime needs. Hopefully nothing blows up doing this... +func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive + +// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the +// current process, except for those critical to Go's runtime (such as the +// netpoll management descriptors). +// +// NOTE: That this function is incredibly dangerous to use in most Go code, as +// closing file descriptors from underneath *os.File handles can lead to very +// bad behaviour (the closed file descriptor can be re-used and then any +// *os.File operations would apply to the wrong file). This function is only +// intended to be called from the last stage of runc init. +func UnsafeCloseFrom(minFd int) error { + // We must not close some file descriptors. + return fdRangeFrom(minFd, func(fd int) { + if runtime_IsPollDescriptor(uintptr(fd)) { + // These are the Go runtimes internal netpoll file descriptors. + // These file descriptors are operated on deep in the Go scheduler, + // and closing those files from underneath Go can result in panics. + // There is no issue with keeping them because they are not + // executable and are not useful to an attacker anyway. Also we + // don't have any choice. + return + } + if logs.IsLogrusFd(uintptr(fd)) { + // Do not close the logrus output fd. We cannot exec a pipe, and + // the contents are quite limited (very little attacker control, + // JSON-encoded) making shellcode attacks unlikely. + return + } + // There's nothing we can do about errors from close(2), and the + // only likely error to be seen is EBADF which indicates the fd was + // already closed (in which case, we got what we wanted). + _ = unix.Close(fd) + }) +} + // NewSockPair returns a new unix socket pair func NewSockPair(name string) (parent *os.File, child *os.File, err error) { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) From b6633f48a8c970433737b9be5bfe4f25d58a5aa7 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 26 Dec 2023 23:36:51 +1100 Subject: [PATCH 16/21] cgroup: plug leaks of /sys/fs/cgroup handle We auto-close this file descriptor in the final exec step, but it's probably a good idea to not possibly leak the file descriptor to "runc init" (we've had issues like this in the past) especially since it is a directory handle from the host mount namespace. In practice, on runc 1.1 this does leak to "runc init" but on main the handle has a low enough file descriptor that it gets clobbered by the ForkExec of "runc init". OPEN_TREE_CLONE would let us protect this handle even further, but the performance impact of creating an anonymous mount namespace is probably not worth it. Also, switch to using an *os.File for the handle so if it goes out of scope during setup (i.e. an error occurs during setup) it will get cleaned up by the GC. Fixes: GHSA-xr7r-f8xq-vfvv CVE-2024-21626 Signed-off-by: Aleksa Sarai --- libcontainer/cgroups/file.go | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/libcontainer/cgroups/file.go b/libcontainer/cgroups/file.go index 48b263a1661..f6e1b73bd92 100644 --- a/libcontainer/cgroups/file.go +++ b/libcontainer/cgroups/file.go @@ -77,16 +77,16 @@ var ( // TestMode is set to true by unit tests that need "fake" cgroupfs. TestMode bool - cgroupFd int = -1 - prepOnce sync.Once - prepErr error - resolveFlags uint64 + cgroupRootHandle *os.File + prepOnce sync.Once + prepErr error + resolveFlags uint64 ) func prepareOpenat2() error { prepOnce.Do(func() { fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{ - Flags: unix.O_DIRECTORY | unix.O_PATH, + Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC, }) if err != nil { prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err} @@ -97,15 +97,16 @@ func prepareOpenat2() error { } return } + file := os.NewFile(uintptr(fd), cgroupfsDir) + var st unix.Statfs_t - if err = unix.Fstatfs(fd, &st); err != nil { + if err := unix.Fstatfs(int(file.Fd()), &st); err != nil { prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err} logrus.Warnf("falling back to securejoin: %s", prepErr) return } - cgroupFd = fd - + cgroupRootHandle = file resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS if st.Type == unix.CGROUP2_SUPER_MAGIC { // cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks @@ -132,7 +133,7 @@ func openFile(dir, file string, flags int) (*os.File, error) { return openFallback(path, flags, mode) } - fd, err := unix.Openat2(cgroupFd, relPath, + fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath, &unix.OpenHow{ Resolve: resolveFlags, Flags: uint64(flags) | unix.O_CLOEXEC, @@ -140,20 +141,20 @@ func openFile(dir, file string, flags int) (*os.File, error) { }) if err != nil { err = &os.PathError{Op: "openat2", Path: path, Err: err} - // Check if cgroupFd is still opened to cgroupfsDir + // Check if cgroupRootHandle is still opened to cgroupfsDir // (happens when this package is incorrectly used // across the chroot/pivot_root/mntns boundary, or // when /sys/fs/cgroup is remounted). // // TODO: if such usage will ever be common, amend this - // to reopen cgroupFd and retry openat2. - fdStr := strconv.Itoa(cgroupFd) + // to reopen cgroupRootHandle and retry openat2. + fdStr := strconv.Itoa(int(cgroupRootHandle.Fd())) fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr) if fdDest != cgroupfsDir { - // Wrap the error so it is clear that cgroupFd + // Wrap the error so it is clear that cgroupRootHandle // is opened to an unexpected/wrong directory. - err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w", - fdStr, fdDest, cgroupfsDir, err) + err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w", + cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err) } return nil, err } From 683ad2ff3b01fb142ece7a8b3829de17150cf688 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 28 Dec 2023 00:42:23 +1100 Subject: [PATCH 17/21] libcontainer: mark all non-stdio fds O_CLOEXEC before spawning init Given the core issue in GHSA-xr7r-f8xq-vfvv was that we were unknowingly leaking file descriptors to "runc init", it seems prudent to make sure we proactively prevent this in the future. The solution is to simply mark all non-stdio file descriptors as O_CLOEXEC before we spawn "runc init". For libcontainer library users, this could result in unrelated files being marked as O_CLOEXEC -- however (for the same reason we are doing this for runc), for security reasons those files should've been marked as O_CLOEXEC anyway. Fixes: GHSA-xr7r-f8xq-vfvv CVE-2024-21626 Signed-off-by: Aleksa Sarai --- libcontainer/container_linux.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 59aa0338ac6..40b332f9810 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -353,6 +353,15 @@ func (c *linuxContainer) start(process *Process) (retErr error) { }() } + // Before starting "runc init", mark all non-stdio open files as O_CLOEXEC + // to make sure we don't leak any files into "runc init". Any files to be + // passed to "runc init" through ExtraFiles will get dup2'd by the Go + // runtime and thus their O_CLOEXEC flag will be cleared. This is some + // additional protection against attacks like CVE-2024-21626, by making + // sure we never leak files to "runc init" we didn't intend to. + if err := utils.CloseExecFrom(3); err != nil { + return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err) + } if err := parent.start(); err != nil { return fmt.Errorf("unable to start container process: %w", err) } From e9665f4d606b64bf9c4652ab2510da368bfbd951 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 20 Jan 2024 16:56:38 +1100 Subject: [PATCH 18/21] init: don't special-case logrus fds We close the logfd before execve so there's no need to special case it. In addition, it turns out that (*os.File).Fd() doesn't handle the case where the file was closed and so it seems suspect to use that kind of check. Signed-off-by: Aleksa Sarai --- libcontainer/logs/logs.go | 9 --------- libcontainer/utils/utils_unix.go | 8 -------- 2 files changed, 17 deletions(-) diff --git a/libcontainer/logs/logs.go b/libcontainer/logs/logs.go index 349a18ed383..95deb0d6ca7 100644 --- a/libcontainer/logs/logs.go +++ b/libcontainer/logs/logs.go @@ -4,19 +4,10 @@ import ( "bufio" "encoding/json" "io" - "os" "github.com/sirupsen/logrus" ) -// IsLogrusFd returns whether the provided fd matches the one that logrus is -// currently outputting to. This should only ever be called by UnsafeCloseFrom -// from `runc init`. -func IsLogrusFd(fd uintptr) bool { - file, ok := logrus.StandardLogger().Out.(*os.File) - return ok && file.Fd() == fd -} - func ForwardLogs(logPipe io.ReadCloser) chan error { done := make(chan error, 1) s := bufio.NewScanner(logPipe) diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index 842f9b0a6d2..bf3237a2911 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -10,8 +10,6 @@ import ( _ "unsafe" // for go:linkname "golang.org/x/sys/unix" - - "github.com/opencontainers/runc/libcontainer/logs" ) // EnsureProcHandle returns whether or not the given file handle is on procfs. @@ -102,12 +100,6 @@ func UnsafeCloseFrom(minFd int) error { // don't have any choice. return } - if logs.IsLogrusFd(uintptr(fd)) { - // Do not close the logrus output fd. We cannot exec a pipe, and - // the contents are quite limited (very little attacker control, - // JSON-encoded) making shellcode attacks unlikely. - return - } // There's nothing we can do about errors from close(2), and the // only likely error to be seen is EBADF which indicates the fd was // already closed (in which case, we got what we wanted). From 51d5e94601ceffbbd85688df1c928ecccbfa4685 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 18 Jan 2024 12:17:28 +1100 Subject: [PATCH 19/21] VERSION: release 1.1.12 Signed-off-by: Aleksa Sarai --- CHANGELOG.md | 21 ++++++++++++++++++++- VERSION | 2 +- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7452f70faf0..70456291713 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased 1.1.z] +## [1.1.12] - 2024-01-31 + +> Now you're thinking with Portals™! + +### Security + +* Fix [CVE-2024-21626][cve-2024-21626], a container breakout attack that took + advantage of a file descriptor that was leaked internally within runc (but + never leaked to the container process). In addition to fixing the leak, + several strict hardening measures were added to ensure that future internal + leaks could not be used to break out in this manner again. Based on our + research, while no other container runtime had a similar leak, none had any + of the hardening steps we've introduced (and some runtimes would not check + for any file descriptors that a calling process may have leaked to them, + allowing for container breakouts due to basic user error). + +[cve-2024-21626]: https://github.com/opencontainers/runc/security/advisories/GHSA-xr7r-f8xq-vfvv + ## [1.1.11] - 2024-01-01 > Happy New Year! @@ -493,7 +511,8 @@ implementation (libcontainer) is *not* covered by this policy. [1.0.1]: https://github.com/opencontainers/runc/compare/v1.0.0...v1.0.1 -[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.11...release-1.1 +[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.12...release-1.1 +[1.1.12]: https://github.com/opencontainers/runc/compare/v1.1.11...v1.1.12 [1.1.11]: https://github.com/opencontainers/runc/compare/v1.1.10...v1.1.11 [1.1.10]: https://github.com/opencontainers/runc/compare/v1.1.9...v1.1.10 [1.1.9]: https://github.com/opencontainers/runc/compare/v1.1.8...v1.1.9 diff --git a/VERSION b/VERSION index 5aafe9bab6b..ccad953ac53 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.1.11+dev +1.1.12 From 29d6d87345aad2103a50848671617abb4273d621 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 20 Jan 2024 17:37:33 +1100 Subject: [PATCH 20/21] VERSION: back to development Signed-off-by: Aleksa Sarai --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index ccad953ac53..9e5f80d9e8b 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.1.12 +1.1.12+dev From 13cb2b07589e8acf2b607419071198efc15b5d0d Mon Sep 17 00:00:00 2001 From: Ameya Gawde Date: Fri, 16 Feb 2024 13:49:16 -0800 Subject: [PATCH 21/21] Bump version to Mirantis ver 1.1.12-m1 Signed-off-by: Ameya Gawde --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 9e5f80d9e8b..81f32601221 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.1.12+dev +1.1.12-m1