diff --git a/devices/ebpf_linux.go b/devices/ebpf_linux.go index cfc36f7..831248e 100644 --- a/devices/ebpf_linux.go +++ b/devices/ebpf_linux.go @@ -1,6 +1,7 @@ package devices import ( + "bytes" "errors" "fmt" "os" @@ -8,14 +9,130 @@ import ( "sync" "unsafe" - "github.com/cilium/ebpf" "github.com/cilium/ebpf/asm" - "github.com/cilium/ebpf/link" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) -func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error) { +func bpf(cmd uintptr, attr unsafe.Pointer, size uintptr) (uintptr, error) { + r1, _, err := unix.Syscall(unix.SYS_BPF, cmd, uintptr(attr), size) + runtime.KeepAlive(attr) + if err != 0 { + return r1, err + } + return r1, nil +} + +// bpfProgLoad loads a BPF_PROG_TYPE_CGROUP_DEVICE program and returns its fd. +func bpfProgLoad(insns asm.Instructions, license string) (int, error) { + buf := bytes.NewBuffer(make([]byte, 0, insns.Size())) + if err := insns.Marshal(buf, nativeEndian); err != nil { + return -1, err + } + insnsBytes := buf.Bytes() + + licensePtr, err := unix.BytePtrFromString(license) + if err != nil { + return -1, err + } + + // Subset of struct bpf_attr for BPF_PROG_LOAD. Fields past the ones we set + // are left zero; the kernel zero-fills any part of bpf_attr beyond the size + // we pass. + attr := struct { + progType uint32 + insnCnt uint32 + insns uint64 // pointer + license uint64 // pointer + logLevel uint32 + logSize uint32 + logBuf uint64 // pointer + }{ + progType: unix.BPF_PROG_TYPE_CGROUP_DEVICE, + insnCnt: uint32(len(insnsBytes) / asm.InstructionSize), + insns: uint64(uintptr(unsafe.Pointer(&insnsBytes[0]))), + license: uint64(uintptr(unsafe.Pointer(licensePtr))), + } + + fd, err := bpf(unix.BPF_PROG_LOAD, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + // attr holds the pointers as integers, so the GC can't see them; keep the + // referenced objects alive until the syscall returns. + runtime.KeepAlive(insnsBytes) + runtime.KeepAlive(licensePtr) + if err == nil { + return int(fd), nil + } + + // The load failed. Retry with the verifier log enabled so we can include + // it in the error (the first attempt skips it, as it is the fast path). + log := make([]byte, 64*1024) + attr.logLevel = 1 + attr.logSize = uint32(len(log)) + attr.logBuf = uint64(uintptr(unsafe.Pointer(&log[0]))) + + fd, err = bpf(unix.BPF_PROG_LOAD, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + runtime.KeepAlive(insnsBytes) + runtime.KeepAlive(licensePtr) + runtime.KeepAlive(log) + if err == nil { + return int(fd), nil + } + if n := bytes.IndexByte(log, 0); n > 0 { + return -1, fmt.Errorf("%w: %s", err, bytes.TrimRight(log[:n], "\n")) + } + return -1, err +} + +// bpfProgGetFdByID returns the fd for the BPF program with the given ID. +func bpfProgGetFdByID(id uint32) (int, error) { + // The kernel zero-fills the rest of bpf_attr beyond the size we pass. + attr := struct{ id uint32 }{id} + fd, err := bpf(unix.BPF_PROG_GET_FD_BY_ID, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + if err != nil { + return -1, err + } + return int(fd), nil +} + +// bpfProgAttach attaches progFd to cgroupFd with the given flags. If replaceFd +// is >= 0, its fd is set in replaceBpfFd (for BPF_F_REPLACE semantics). +func bpfProgAttach(cgroupFd, progFd int, attachFlags uint32, replaceFd int) error { + attr := struct { + targetFd uint32 + attachBpfFd uint32 + attachType uint32 + attachFlags uint32 + replaceBpfFd uint32 + }{ + targetFd: uint32(cgroupFd), + attachBpfFd: uint32(progFd), + attachType: uint32(unix.BPF_CGROUP_DEVICE), + attachFlags: attachFlags, + } + if replaceFd >= 0 { + attr.replaceBpfFd = uint32(replaceFd) + } + _, err := bpf(unix.BPF_PROG_ATTACH, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return err +} + +// bpfProgDetach detaches progFd from cgroupFd. +func bpfProgDetach(cgroupFd, progFd int) error { + // The kernel zero-fills the rest of bpf_attr beyond the size we pass. + attr := struct { + targetFd uint32 + attachBpfFd uint32 + attachType uint32 + }{ + targetFd: uint32(cgroupFd), + attachBpfFd: uint32(progFd), + attachType: uint32(unix.BPF_CGROUP_DEVICE), + } + _, err := bpf(unix.BPF_PROG_DETACH, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return err +} + +func findAttachedCgroupDeviceFilters(dirFd int) (_ []int, retErr error) { type bpfAttrQuery struct { TargetFd uint32 AttachType uint32 @@ -37,36 +154,34 @@ func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error ProgCnt: uint32(len(progIds)), } - // Fetch the list of program ids. - _, _, errno := unix.Syscall(unix.SYS_BPF, - uintptr(unix.BPF_PROG_QUERY), - uintptr(unsafe.Pointer(&query)), - unsafe.Sizeof(query)) + // Fetch the list of program ids. bpf() keeps &query alive for the + // duration of the syscall, and query.ProgCnt is read right after. + _, err := bpf(unix.BPF_PROG_QUERY, unsafe.Pointer(&query), unsafe.Sizeof(query)) + runtime.KeepAlive(progIds) size = int(query.ProgCnt) - runtime.KeepAlive(query) - if errno != 0 { + if err != nil { // On ENOSPC we get the correct number of programs. - if errno == unix.ENOSPC { + if errors.Is(err, unix.ENOSPC) { retries++ continue } - return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno) + return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", err) } - // Convert the ids to program handles. - // On error we don't return the programs slice, so close the fds stored there. + // Convert the ids to program fds. + // On error we don't return the fds slice, so close the fds stored there. progIds = progIds[:size] - programs := make([]*ebpf.Program, 0, len(progIds)) + fds := make([]int, 0, len(progIds)) defer func() { if retErr != nil { - for _, p := range programs { - p.Close() + for _, fd := range fds { + unix.Close(fd) } } }() for _, progId := range progIds { - program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId)) + fd, err := bpfProgGetFdByID(progId) if err != nil { // We skip over programs that give us -EACCES or -EPERM. This // is necessary because there may be BPF programs that have @@ -83,10 +198,10 @@ func findAttachedCgroupDeviceFilters(dirFd int) (_ []*ebpf.Program, retErr error } return nil, fmt.Errorf("cannot fetch program from id: %w", err) } - programs = append(programs, program) + fds = append(fds, fd) } runtime.KeepAlive(progIds) - return programs, nil + return fds, nil } return nil, errors.New("could not get complete list of CGROUP_DEVICE programs") @@ -99,23 +214,17 @@ var ( // Loosely based on the BPF_F_REPLACE support check in // https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go. -// -// TODO: move this logic to cilium/ebpf func haveBpfProgReplace() bool { haveBpfProgReplaceOnce.Do(func() { - prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ - Type: ebpf.CGroupDevice, - License: "MIT", - Instructions: asm.Instructions{ - asm.Mov.Imm(asm.R0, 0), - asm.Return(), - }, - }) + progFd, err := bpfProgLoad(asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, "MIT") if err != nil { - logrus.Warnf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err) + logrus.Warnf("checking for BPF_F_REPLACE support: bpfProgLoad failed: %v", err) return } - defer prog.Close() + defer unix.Close(progFd) devnull, err := os.Open("/dev/null") if err != nil { @@ -127,24 +236,19 @@ func haveBpfProgReplace() bool { // We know that we have BPF_PROG_ATTACH since we can load // BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL // we know that the feature isn't present. - err = link.RawAttachProgram(link.RawAttachProgramOptions{ - // We rely on this fd being checked after attachFlags in the kernel. - Target: int(devnull.Fd()), - // Attempt to "replace" our BPF program with itself. This will - // always fail, but we should get -EINVAL if BPF_F_REPLACE is not - // supported. - Anchor: link.ReplaceProgram(prog), - Program: prog, - Attach: ebpf.AttachCGroupDevice, - Flags: unix.BPF_F_ALLOW_MULTI, - }) - if errors.Is(err, ebpf.ErrNotSupported) || errors.Is(err, unix.EINVAL) { + // + // We rely on the target fd being checked after attachFlags in the + // kernel. Attempting to "replace" our BPF program with itself always + // fails, but we should get -EINVAL if BPF_F_REPLACE is not supported, + // and -EBADF (from the dummy target fd) if it is. + err = bpfProgAttach(int(devnull.Fd()), progFd, unix.BPF_F_ALLOW_MULTI|unix.BPF_F_REPLACE, progFd) + if errors.Is(err, unix.EINVAL) { // not supported return } if !errors.Is(err, unix.EBADF) { // If we see any new errors here, it's possible that there is a - // regression due to a cilium/ebpf update and the above EINVAL + // regression due to a kernel update and the above EINVAL // checks are not working. So, be loud about it so someone notices // and we can get the issue fixed quicker. logrus.Warnf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err) @@ -169,83 +273,58 @@ func loadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd _ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit) // Get the list of existing programs. - oldProgs, err := findAttachedCgroupDeviceFilters(dirFd) + oldFds, err := findAttachedCgroupDeviceFilters(dirFd) if err != nil { return err } defer func() { - for _, p := range oldProgs { - p.Close() + for _, fd := range oldFds { + unix.Close(fd) } }() - useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1 + useReplaceProg := haveBpfProgReplace() && len(oldFds) == 1 // Generate new program. - spec := &ebpf.ProgramSpec{ - Type: ebpf.CGroupDevice, - Instructions: insts, - License: license, - } - prog, err := ebpf.NewProgram(spec) + progFd, err := bpfProgLoad(insts, license) if err != nil { return err } - defer prog.Close() + // Once the program is attached, the kernel keeps it alive via the cgroup + // attachment, so we no longer need our own fd; we also don't need it if the + // attach below fails. Either way, close it on return. + defer unix.Close(progFd) // If there is only one old program, we can just replace it directly. - - attachProgramOptions := link.RawAttachProgramOptions{ - Target: dirFd, - Program: prog, - Attach: ebpf.AttachCGroupDevice, - Flags: unix.BPF_F_ALLOW_MULTI, - } - + replaceFd := -1 + attachFlags := uint32(unix.BPF_F_ALLOW_MULTI) if useReplaceProg { - attachProgramOptions.Anchor = link.ReplaceProgram(oldProgs[0]) + replaceFd = oldFds[0] + attachFlags |= unix.BPF_F_REPLACE } - err = link.RawAttachProgram(attachProgramOptions) + err = bpfProgAttach(dirFd, progFd, attachFlags, replaceFd) if err != nil { return fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err) } + if !useReplaceProg { logLevel := logrus.DebugLevel // If there was more than one old program, give a warning (since this // really shouldn't happen with runc-managed cgroups) and then detach // all the old programs. - if len(oldProgs) > 1 { + if len(oldFds) > 1 { // NOTE: Ideally this should be a warning but it turns out that // systemd-managed cgroups trigger this warning (apparently // systemd doesn't delete old non-systemd programs when // setting properties). - logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs)) + logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldFds)) logLevel = logrus.InfoLevel } - for idx, oldProg := range oldProgs { - // Output some extra debug info. - if info, err := oldProg.Info(); err == nil { - fields := logrus.Fields{ - "type": info.Type.String(), - "tag": info.Tag, - "name": info.Name, - } - if id, ok := info.ID(); ok { - fields["id"] = id - } - if runCount, ok := info.RunCount(); ok { - fields["run_count"] = runCount - } - if runtime, ok := info.Runtime(); ok { - fields["runtime"] = runtime.String() - } - logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx) - } - err = link.RawDetachProgram(link.RawDetachProgramOptions{ - Target: dirFd, - Program: oldProg, - Attach: ebpf.AttachCGroupDevice, - }) + for idx, oldFd := range oldFds { + logrus.WithFields(logrus.Fields{ + "fd": oldFd, + }).Logf(logLevel, "removing old filter %d from cgroup", idx) + err = bpfProgDetach(dirFd, oldFd) if err != nil { return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err) } diff --git a/devices/endian_be.go b/devices/endian_be.go new file mode 100644 index 0000000..3219b88 --- /dev/null +++ b/devices/endian_be.go @@ -0,0 +1,9 @@ +//go:build armbe || arm64be || mips || mips64 || mips64p32 || ppc64 || s390 || s390x || sparc || sparc64 + +package devices + +import "encoding/binary" + +// nativeEndian is used as a workaround for cilium/ebpf/asm +// which does not accept binary.NativeEndian. +var nativeEndian = binary.BigEndian diff --git a/devices/endian_le.go b/devices/endian_le.go new file mode 100644 index 0000000..de9083a --- /dev/null +++ b/devices/endian_le.go @@ -0,0 +1,9 @@ +//go:build 386 || amd64 || amd64p32 || arm || arm64 || loong64 || mipsle || mips64le || mips64p32le || ppc64le || riscv64 || wasm + +package devices + +import "encoding/binary" + +// nativeEndian is used as a workaround for cilium/ebpf/asm +// which does not accept binary.NativeEndian. +var nativeEndian = binary.LittleEndian diff --git a/devices/endian_test.go b/devices/endian_test.go new file mode 100644 index 0000000..31747f4 --- /dev/null +++ b/devices/endian_test.go @@ -0,0 +1,19 @@ +package devices + +import ( + "encoding/binary" + "runtime" + "testing" +) + +func TestNativeEndian(t *testing.T) { + const want = uint64(0x0102030405060708) + + var got, native [8]byte + nativeEndian.PutUint64(got[:], want) + binary.NativeEndian.PutUint64(native[:], want) + + if got != native { + t.Fatalf("Build constraints for GOARCH=%s are wrong; fix endian_{be,le}.go", runtime.GOARCH) + } +} diff --git a/go.sum b/go.sum index db73c8e..136c60f 100644 --- a/go.sum +++ b/go.sum @@ -13,18 +13,10 @@ github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk= github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA= -github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= -github.com/jsimonetti/rtnetlink/v2 v2.0.1 h1:xda7qaHDSVOsADNouv7ukSuicKZO7GgVUCXxpaIEIlM= -github.com/jsimonetti/rtnetlink/v2 v2.0.1/go.mod h1:7MoNYNbb3UaDHtF8udiJo/RH6VsTKP1pqKLUTVCvToE= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g= -github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw= -github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U= -github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA= github.com/moby/sys/mountinfo v0.7.2 h1:1shs6aH5s4o5H2zQLn796ADW1wMrIwHsyJ2v9KouLrg= github.com/moby/sys/mountinfo v0.7.2/go.mod h1:1YOa8w8Ih7uW0wALDUgT1dTTSBrZ+HiBLGws92L2RU4= github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g= @@ -39,10 +31,6 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= -golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= -golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= -golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=