podman cgroup enhancement

currently, setting any sort of resource limit in a pod does nothing. With the newly refactored creation process in c/common, podman ca now set resources at a pod level
meaning that resource related flags can now be exposed to podman pod create.

cgroupfs and systemd are both supported with varying completion. cgroupfs is a much simpler process and one that is virtually complete for all resource types, the flags now just need to be added. systemd on the other hand
has to be handeled via the dbus api meaning that the limits need to be passed as recognized properties to systemd. The properties added so far are the ones that podman pod create supports as well as `cpuset-mems` as this will
be the next flag I work on.

Signed-off-by: Charlie Doern <cdoern@redhat.com>
This commit is contained in:
cdoern 2022-06-13 15:35:16 -04:00 committed by Charlie Doern
parent 95707a08bf
commit 2792e598c7
70 changed files with 5660 additions and 307 deletions

4
go.mod
View file

@ -12,7 +12,7 @@ require (
github.com/containernetworking/cni v1.1.1
github.com/containernetworking/plugins v1.1.1
github.com/containers/buildah v1.26.1-0.20220609225314-e66309ebde8c
github.com/containers/common v0.48.1-0.20220608111710-dbecabbe82c9
github.com/containers/common v0.48.1-0.20220624132904-722a80e139ec
github.com/containers/conmon v2.0.20+incompatible
github.com/containers/image/v5 v5.21.2-0.20220617075545-929f14a56f5c
github.com/containers/ocicrypt v1.1.5
@ -75,3 +75,5 @@ require (
)
require github.com/docker/libnetwork v0.8.0-dev.2.0.20190625141545-5a177b73e316 // indirect
replace github.com/opencontainers/runc => github.com/opencontainers/runc v1.1.1-0.20220617142545-8b9452f75cbc

28
go.sum
View file

@ -198,8 +198,6 @@ github.com/charithe/durationcheck v0.0.9/go.mod h1:SSbRIBVfMjCi/kEB6K65XEA83D6pr
github.com/chavacava/garif v0.0.0-20210405164556-e8a0a408d6af/go.mod h1:Qjyv4H3//PWVzTeCezG2b9IRn6myJxJSr4TD/xo6ojU=
github.com/checkpoint-restore/checkpointctl v0.0.0-20220321135231-33f4a66335f0 h1:txB5jvhzUCSiiQmqmMWpo5CEB7Gj/Hq5Xqi7eaPl8ko=
github.com/checkpoint-restore/checkpointctl v0.0.0-20220321135231-33f4a66335f0/go.mod h1:67kWC1PXQLR3lM/mmNnu3Kzn7K4TSWZAGUuQP1JSngk=
github.com/checkpoint-restore/go-criu/v4 v4.1.0/go.mod h1:xUQBLp4RLc5zJtWY++yjOoMoB5lihDt7fai+75m+rGw=
github.com/checkpoint-restore/go-criu/v5 v5.0.0/go.mod h1:cfwC0EG7HMUenopBsUf9d89JlCLQIfgVcNsNN0t6T2M=
github.com/checkpoint-restore/go-criu/v5 v5.2.0/go.mod h1:E/eQpaFtUKGOOSEBZgmKAcn+zUUwWxqcaKZlF54wK8E=
github.com/checkpoint-restore/go-criu/v5 v5.3.0 h1:wpFFOoomK3389ue2lAb0Boag6XPht5QYpipxmSNL4d8=
github.com/checkpoint-restore/go-criu/v5 v5.3.0/go.mod h1:E/eQpaFtUKGOOSEBZgmKAcn+zUUwWxqcaKZlF54wK8E=
@ -213,7 +211,6 @@ github.com/cilium/ebpf v0.0.0-20200110133405-4032b1d8aae3/go.mod h1:MA5e5Lr8slmE
github.com/cilium/ebpf v0.0.0-20200702112145-1c8d4c9ef775/go.mod h1:7cR51M8ViRLIdUjrmSXlK9pkrsDlLHbO8jiB8X8JnOc=
github.com/cilium/ebpf v0.2.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX2Qs=
github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs=
github.com/cilium/ebpf v0.6.2/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs=
github.com/cilium/ebpf v0.7.0/go.mod h1:/oI2+1shJiTGAMgl6/RgJr36Eo1jzrRcAWbcXO2usCA=
github.com/cilium/ebpf v0.9.0/go.mod h1:+OhNOIXx/Fnu1IE8bJz2dzOA+VSfyTfdNUVdlQnxUFY=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
@ -340,12 +337,14 @@ github.com/containernetworking/plugins v1.1.1 h1:+AGfFigZ5TiQH00vhR8qPeSatj53eNG
github.com/containernetworking/plugins v1.1.1/go.mod h1:Sr5TH/eBsGLXK/h71HeLfX19sZPp3ry5uHSkI4LPxV8=
github.com/containers/buildah v1.26.1-0.20220609225314-e66309ebde8c h1:/fKyiLFFuceBPZGJ0Lig7ElURhfsslAOw1BOcItD+X8=
github.com/containers/buildah v1.26.1-0.20220609225314-e66309ebde8c/go.mod h1:b0L+u2Dam7soWGn5sVTK31L++Xrf80AbGvK5z9D2+lw=
github.com/containers/common v0.48.1-0.20220608111710-dbecabbe82c9 h1:sK+TNC8oUBkruZTIqwYJrENetSLQnk+goBVyLiqsJq8=
github.com/containers/common v0.48.1-0.20220608111710-dbecabbe82c9/go.mod h1:WBLwq+i7bicCpH54V70HM6s7jqDAESTlYnd05XXp0ac=
github.com/containers/common v0.48.1-0.20220624132904-722a80e139ec h1:kcfKciMCi6/A72M8TSX2ZYJFa5Yu9hC2Mct8FkuW1Xw=
github.com/containers/common v0.48.1-0.20220624132904-722a80e139ec/go.mod h1:KDNk8Lazjjjqs9643cKH9dnq6IXB4Lf7evya5GiFcg0=
github.com/containers/conmon v2.0.20+incompatible h1:YbCVSFSCqFjjVwHTPINGdMX1F6JXHGTUje2ZYobNrkg=
github.com/containers/conmon v2.0.20+incompatible/go.mod h1:hgwZ2mtuDrppv78a/cOBNiCm6O0UMWGx1mu7P00nu5I=
github.com/containers/image/v5 v5.21.2-0.20220511203756-fe4fd4ed8be4/go.mod h1:OsX9sFexyGF0FCNAjfcVFv3IwMqDyLyV/WQY/roLPcE=
github.com/containers/image/v5 v5.21.2-0.20220520105616-e594853d6471/go.mod h1:KntCBNQn3qOuZmQuJ38ORyTozmWXiuo05Vef2S0Sm5M=
github.com/containers/image/v5 v5.21.2-0.20220615100411-a78a00792916/go.mod h1:hEf8L08Hrh/3fK4vLf5l7988MJmij2swfCBUzqgnhF4=
github.com/containers/image/v5 v5.21.2-0.20220617075545-929f14a56f5c h1:U2F/FaMt8gPP8sIpBfvXMCP5gAZfyxoYZ7lmu0dwsXc=
github.com/containers/image/v5 v5.21.2-0.20220617075545-929f14a56f5c/go.mod h1:hEf8L08Hrh/3fK4vLf5l7988MJmij2swfCBUzqgnhF4=
github.com/containers/libtrust v0.0.0-20200511145503-9c3a6c22cd9a h1:spAGlqziZjCJL25C6F1zsQY05tfCKE9F5YwtEWWe6hU=
@ -365,6 +364,7 @@ github.com/containers/storage v1.38.0/go.mod h1:lBzt28gAk5ADZuRtwdndRJyqX22vnRaX
github.com/containers/storage v1.40.2/go.mod h1:zUyPC3CFIGR1OhY1CKkffxgw9+LuH76PGvVcFj38dgs=
github.com/containers/storage v1.41.0/go.mod h1:Pb0l5Sm/89kolX3o2KolKQ5cCHk5vPNpJrhNaLcdS5s=
github.com/containers/storage v1.41.1-0.20220607143333-8951d0153bf6/go.mod h1:6XQ68cEG8ojfP/m3HIupFV1rZsnqeFmaE8N1ctBP94Y=
github.com/containers/storage v1.41.1-0.20220614214904-388efef4bf7e/go.mod h1:6XQ68cEG8ojfP/m3HIupFV1rZsnqeFmaE8N1ctBP94Y=
github.com/containers/storage v1.41.1-0.20220616120034-7df64288ef35 h1:6o+kw2z0BrdJ1wNYUbwLVzlb/65KPmZSy+32GNfppzM=
github.com/containers/storage v1.41.1-0.20220616120034-7df64288ef35/go.mod h1:6XQ68cEG8ojfP/m3HIupFV1rZsnqeFmaE8N1ctBP94Y=
github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk=
@ -399,7 +399,6 @@ github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7Do
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/creack/pty v1.1.11 h1:07n33Z8lZxZ2qwegKbObQohDhXDQxiMMz1NOUGYlesw=
github.com/creack/pty v1.1.11/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/cyphar/filepath-securejoin v0.2.2/go.mod h1:FpkQEhXnPnOthhzymB7CGsFk2G9VLXONKD9G7QGMM+4=
github.com/cyphar/filepath-securejoin v0.2.3 h1:YX6ebbZCZP7VkM3scTTokDgBL2TY741X51MTk3ycuNI=
github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4=
github.com/d2g/dhcp4 v0.0.0-20170904100407-a1d1b6c41b1c/go.mod h1:Ct2BUK8SB0YC1SMSibvLzxjeJLnrYEVLULFNiHY9YfQ=
@ -1053,19 +1052,8 @@ github.com/opencontainers/image-spec v1.0.3-0.20211202183452-c5a74bcca799/go.mod
github.com/opencontainers/image-spec v1.0.3-0.20211202193544-a5463b7f9c84/go.mod h1:Qnt1q4cjDNQI9bT832ziho5Iw2BhK8o1KwLOwW56VP4=
github.com/opencontainers/image-spec v1.0.3-0.20220114050600-8b9d41f48198 h1:+czc/J8SlhPKLOtVLMQc+xDCFBT73ZStMsRhSsUhsSg=
github.com/opencontainers/image-spec v1.0.3-0.20220114050600-8b9d41f48198/go.mod h1:j4h1pJW6ZcJTgMZWP3+7RlG3zTaP02aDZ/Qw0sppK7Q=
github.com/opencontainers/runc v0.0.0-20190115041553-12f6a991201f/go.mod h1:qT5XzbpPznkRYVz/mWwUaVBUv2rmF59PVA73FjuZG0U=
github.com/opencontainers/runc v0.1.1/go.mod h1:qT5XzbpPznkRYVz/mWwUaVBUv2rmF59PVA73FjuZG0U=
github.com/opencontainers/runc v1.0.0-rc8.0.20190926000215-3e425f80a8c9/go.mod h1:qT5XzbpPznkRYVz/mWwUaVBUv2rmF59PVA73FjuZG0U=
github.com/opencontainers/runc v1.0.0-rc9/go.mod h1:qT5XzbpPznkRYVz/mWwUaVBUv2rmF59PVA73FjuZG0U=
github.com/opencontainers/runc v1.0.0-rc93/go.mod h1:3NOsor4w32B2tC0Zbl8Knk4Wg84SM2ImC1fxBuqJ/H0=
github.com/opencontainers/runc v1.0.2/go.mod h1:aTaHFFwQXuA71CiyxOdFFIorAoemI04suvGRQFzWTD0=
github.com/opencontainers/runc v1.0.3/go.mod h1:aTaHFFwQXuA71CiyxOdFFIorAoemI04suvGRQFzWTD0=
github.com/opencontainers/runc v1.1.0/go.mod h1:Tj1hFw6eFWp/o33uxGf5yF2BX5yz2Z6iptFpuvbbKqc=
github.com/opencontainers/runc v1.1.1-0.20220607072441-a7a45d7d2721/go.mod h1:QvA0UNe48mC1JxcXq0sENIR38+/LdJMLNxuAvtFBhxA=
github.com/opencontainers/runc v1.1.1/go.mod h1:Tj1hFw6eFWp/o33uxGf5yF2BX5yz2Z6iptFpuvbbKqc=
github.com/opencontainers/runc v1.1.2/go.mod h1:Tj1hFw6eFWp/o33uxGf5yF2BX5yz2Z6iptFpuvbbKqc=
github.com/opencontainers/runc v1.1.3 h1:vIXrkId+0/J2Ymu2m7VjGvbSlAId9XNRPhn2p4b+d8w=
github.com/opencontainers/runc v1.1.3/go.mod h1:1J5XiS+vdZ3wCyZybsuxXZWGrgSr8fFJHLXuG2PsnNg=
github.com/opencontainers/runc v1.1.1-0.20220617142545-8b9452f75cbc h1:qjkUzmFsOFbQyjObybk40mRida83j5IHRaKzLGdBbEU=
github.com/opencontainers/runc v1.1.1-0.20220617142545-8b9452f75cbc/go.mod h1:wUOQGsiKae6VzA/UvlCK3cO+pHk8F2VQHlIoITEfMM8=
github.com/opencontainers/runtime-spec v0.1.2-0.20190507144316-5b71a03e2700/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
github.com/opencontainers/runtime-spec v1.0.1/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
github.com/opencontainers/runtime-spec v1.0.2-0.20190207185410-29686dbc5559/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
@ -1198,10 +1186,10 @@ github.com/sclevine/spec v1.2.0/go.mod h1:W4J29eT/Kzv7/b9IWLB055Z+qvVC9vt0Arko24
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc=
github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y=
github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
github.com/seccomp/libseccomp-golang v0.9.1/go.mod h1:GbW5+tmTXfcxTToHLXlScSlAvWlF4P2Ca7zGrPiEpWo=
github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg=
github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646 h1:RpforrEYXWkmGwJHIGnLZ3tTWStkjVVstwzNGqxX2Ds=
github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg=
github.com/seccomp/libseccomp-golang v0.10.0 h1:aA4bp+/Zzi0BnWZ2F1wgNBs5gTpm+na2rWM6M9YjLpY=
github.com/seccomp/libseccomp-golang v0.10.0/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg=
github.com/securego/gosec/v2 v2.9.1/go.mod h1:oDcDLcatOJxkCGaCaq8lua1jTnYf6Sou4wdiJ1n4iHc=
github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=
github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM=

View file

@ -424,7 +424,6 @@ type InfraInherit struct {
CapDrop []string `json:"cap_drop,omitempty"`
HostDeviceList []spec.LinuxDevice `json:"host_device_list,omitempty"`
ImageVolumes []*specgen.ImageVolume `json:"image_volumes,omitempty"`
InfraResources *spec.LinuxResources `json:"resource_limits,omitempty"`
Mounts []spec.Mount `json:"mounts,omitempty"`
NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
OverlayVolumes []*specgen.OverlayVolume `json:"overlay_volumes,omitempty"`

View file

@ -870,6 +870,7 @@ func (c *Container) generateSpec(ctx context.Context) (*spec.Spec, error) {
if err != nil {
return nil, err
}
g.SetLinuxCgroupsPath(cgroupPath)
// Warning: CDI may alter g.Config in place.

View file

@ -23,6 +23,9 @@ import (
"text/template"
"time"
runcconfig "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/containers/common/pkg/cgroups"
"github.com/containers/common/pkg/config"
conmonConfig "github.com/containers/conmon/runner/config"
@ -1451,9 +1454,14 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec
// TODO: This should be a switch - we are not guaranteed that
// there are only 2 valid cgroup managers
cgroupParent := ctr.CgroupParent()
cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon")
Resource := ctr.Spec().Linux.Resources
cgroupResources, err := GetLimits(Resource)
if err != nil {
logrus.StandardLogger().Log(logLevel, "Could not get ctr resources")
}
if ctr.CgroupManager() == config.SystemdCgroupsManager {
unitName := createUnitName("libpod-conmon", ctr.ID())
realCgroupParent := cgroupParent
splitParent := strings.Split(cgroupParent, "/")
if strings.HasSuffix(cgroupParent, ".slice") && len(splitParent) > 1 {
@ -1465,8 +1473,7 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec
logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to systemd sandbox cgroup: %v", err)
}
} else {
cgroupPath := filepath.Join(ctr.config.CgroupParent, "conmon")
control, err := cgroups.New(cgroupPath, &spec.LinuxResources{})
control, err := cgroups.New(cgroupPath, &cgroupResources)
if err != nil {
logrus.StandardLogger().Logf(logLevel, "Failed to add conmon to cgroupfs sandbox cgroup: %v", err)
} else if err := control.AddPid(cmd.Process.Pid); err != nil {
@ -1748,3 +1755,191 @@ func httpAttachNonTerminalCopy(container *net.UnixConn, http *bufio.ReadWriter,
}
}
}
// GetLimits converts spec resource limits to cgroup consumable limits
func GetLimits(resource *spec.LinuxResources) (runcconfig.Resources, error) {
if resource == nil {
resource = &spec.LinuxResources{}
}
final := &runcconfig.Resources{}
devs := []*devices.Rule{}
// Devices
for _, entry := range resource.Devices {
if entry.Major == nil || entry.Minor == nil {
continue
}
runeType := 'a'
switch entry.Type {
case "b":
runeType = 'b'
case "c":
runeType = 'c'
}
devs = append(devs, &devices.Rule{
Type: devices.Type(runeType),
Major: *entry.Major,
Minor: *entry.Minor,
Permissions: devices.Permissions(entry.Access),
Allow: entry.Allow,
})
}
final.Devices = devs
// HugepageLimits
pageLimits := []*runcconfig.HugepageLimit{}
for _, entry := range resource.HugepageLimits {
pageLimits = append(pageLimits, &runcconfig.HugepageLimit{
Pagesize: entry.Pagesize,
Limit: entry.Limit,
})
}
final.HugetlbLimit = pageLimits
// Networking
netPriorities := []*runcconfig.IfPrioMap{}
if resource.Network != nil {
for _, entry := range resource.Network.Priorities {
netPriorities = append(netPriorities, &runcconfig.IfPrioMap{
Interface: entry.Name,
Priority: int64(entry.Priority),
})
}
}
final.NetPrioIfpriomap = netPriorities
rdma := make(map[string]runcconfig.LinuxRdma)
for name, entry := range resource.Rdma {
rdma[name] = runcconfig.LinuxRdma{HcaHandles: entry.HcaHandles, HcaObjects: entry.HcaObjects}
}
final.Rdma = rdma
// Memory
if resource.Memory != nil {
if resource.Memory.Limit != nil {
final.Memory = *resource.Memory.Limit
}
if resource.Memory.Reservation != nil {
final.MemoryReservation = *resource.Memory.Reservation
}
if resource.Memory.Swap != nil {
final.MemorySwap = *resource.Memory.Swap
}
if resource.Memory.Swappiness != nil {
final.MemorySwappiness = resource.Memory.Swappiness
}
}
// CPU
if resource.CPU != nil {
if resource.CPU.Period != nil {
final.CpuPeriod = *resource.CPU.Period
}
if resource.CPU.Quota != nil {
final.CpuQuota = *resource.CPU.Quota
}
if resource.CPU.RealtimePeriod != nil {
final.CpuRtPeriod = *resource.CPU.RealtimePeriod
}
if resource.CPU.RealtimeRuntime != nil {
final.CpuRtRuntime = *resource.CPU.RealtimeRuntime
}
if resource.CPU.Shares != nil {
final.CpuShares = *resource.CPU.Shares
}
final.CpusetCpus = resource.CPU.Cpus
final.CpusetMems = resource.CPU.Mems
}
// BlkIO
if resource.BlockIO != nil {
if len(resource.BlockIO.ThrottleReadBpsDevice) > 0 {
for _, entry := range resource.BlockIO.ThrottleReadBpsDevice {
throttle := &runcconfig.ThrottleDevice{}
dev := &runcconfig.BlockIODevice{
Major: entry.Major,
Minor: entry.Minor,
}
throttle.BlockIODevice = *dev
throttle.Rate = entry.Rate
final.BlkioThrottleReadBpsDevice = append(final.BlkioThrottleReadBpsDevice, throttle)
}
}
if len(resource.BlockIO.ThrottleWriteBpsDevice) > 0 {
for _, entry := range resource.BlockIO.ThrottleWriteBpsDevice {
throttle := &runcconfig.ThrottleDevice{}
dev := &runcconfig.BlockIODevice{
Major: entry.Major,
Minor: entry.Minor,
}
throttle.BlockIODevice = *dev
throttle.Rate = entry.Rate
final.BlkioThrottleWriteBpsDevice = append(final.BlkioThrottleWriteBpsDevice, throttle)
}
}
if len(resource.BlockIO.ThrottleReadIOPSDevice) > 0 {
for _, entry := range resource.BlockIO.ThrottleReadIOPSDevice {
throttle := &runcconfig.ThrottleDevice{}
dev := &runcconfig.BlockIODevice{
Major: entry.Major,
Minor: entry.Minor,
}
throttle.BlockIODevice = *dev
throttle.Rate = entry.Rate
final.BlkioThrottleReadIOPSDevice = append(final.BlkioThrottleReadIOPSDevice, throttle)
}
}
if len(resource.BlockIO.ThrottleWriteIOPSDevice) > 0 {
for _, entry := range resource.BlockIO.ThrottleWriteIOPSDevice {
throttle := &runcconfig.ThrottleDevice{}
dev := &runcconfig.BlockIODevice{
Major: entry.Major,
Minor: entry.Minor,
}
throttle.BlockIODevice = *dev
throttle.Rate = entry.Rate
final.BlkioThrottleWriteIOPSDevice = append(final.BlkioThrottleWriteIOPSDevice, throttle)
}
}
if resource.BlockIO.LeafWeight != nil {
final.BlkioLeafWeight = *resource.BlockIO.LeafWeight
}
if resource.BlockIO.Weight != nil {
final.BlkioWeight = *resource.BlockIO.Weight
}
if len(resource.BlockIO.WeightDevice) > 0 {
for _, entry := range resource.BlockIO.WeightDevice {
weight := &runcconfig.WeightDevice{}
dev := &runcconfig.BlockIODevice{
Major: entry.Major,
Minor: entry.Minor,
}
if entry.Weight != nil {
weight.Weight = *entry.Weight
}
if entry.LeafWeight != nil {
weight.LeafWeight = *entry.LeafWeight
}
weight.BlockIODevice = *dev
final.BlkioWeightDevice = append(final.BlkioWeightDevice, weight)
}
}
}
// Pids
if resource.Pids != nil {
final.PidsLimit = resource.Pids.Limit
}
// Networking
if resource.Network != nil {
if resource.Network.ClassID != nil {
final.NetClsClassid = *resource.Network.ClassID
}
}
// Unified state
final.Unified = resource.Unified
return *final, nil
}

View file

@ -69,7 +69,7 @@ func (p *Pod) refresh() error {
if p.config.UsePodCgroup {
switch p.runtime.config.Engine.CgroupManager {
case config.SystemdCgroupsManager:
cgroupPath, err := systemdSliceFromPath(p.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", p.ID()))
cgroupPath, err := systemdSliceFromPath(p.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", p.ID()), p.ResourceLim())
if err != nil {
logrus.Errorf("Creating Cgroup for pod %s: %v", p.ID(), err)
}

View file

@ -17,7 +17,7 @@ import (
"github.com/containers/podman/v4/libpod/events"
"github.com/containers/podman/v4/pkg/rootless"
"github.com/containers/podman/v4/pkg/specgen"
spec "github.com/opencontainers/runtime-spec/specs-go"
runcconfig "github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
@ -66,6 +66,7 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option
case config.CgroupfsCgroupsManager:
canUseCgroup := !rootless.IsRootless() || isRootlessCgroupSet(pod.config.CgroupParent)
if canUseCgroup {
// need to actually create parent here
if pod.config.CgroupParent == "" {
pod.config.CgroupParent = CgroupfsDefaultCgroupParent
} else if strings.HasSuffix(path.Base(pod.config.CgroupParent), ".slice") {
@ -73,12 +74,26 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option
}
// If we are set to use pod cgroups, set the cgroup parent that
// all containers in the pod will share
// No need to create it with cgroupfs - the first container to
// launch should do it for us
if pod.config.UsePodCgroup {
pod.state.CgroupPath = filepath.Join(pod.config.CgroupParent, pod.ID())
if p.InfraContainerSpec != nil {
p.InfraContainerSpec.CgroupParent = pod.state.CgroupPath
res, err := GetLimits(p.InfraContainerSpec.ResourceLimits)
if err != nil {
return nil, err
}
// Need to both create and update the cgroup
// rather than create a new path in c/common for pod cgroup creation
// just create as if it is a ctr and then update figures out that we need to
// populate the resource limits on the pod level
cgc, err := cgroups.New(pod.state.CgroupPath, &res)
if err != nil {
return nil, err
}
err = cgc.Update(&res)
if err != nil {
return nil, err
}
}
}
}
@ -95,7 +110,7 @@ func (r *Runtime) NewPod(ctx context.Context, p specgen.PodSpecGenerator, option
// If we are set to use pod cgroups, set the cgroup parent that
// all containers in the pod will share
if pod.config.UsePodCgroup {
cgroupPath, err := systemdSliceFromPath(pod.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", pod.ID()))
cgroupPath, err := systemdSliceFromPath(pod.config.CgroupParent, fmt.Sprintf("libpod_pod_%s", pod.ID()), p.InfraContainerSpec.ResourceLimits)
if err != nil {
return nil, errors.Wrapf(err, "unable to create pod cgroup for pod %s", pod.ID())
}
@ -239,9 +254,8 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool,
}
// New resource limits
resLimits := new(spec.LinuxResources)
resLimits.Pids = new(spec.LinuxPids)
resLimits.Pids.Limit = 1 // Inhibit forks with very low pids limit
resLimits := new(runcconfig.Resources)
resLimits.PidsLimit = 1 // Inhibit forks with very low pids limit
// Don't try if we failed to retrieve the cgroup
if err == nil {
@ -321,7 +335,7 @@ func (r *Runtime) removePod(ctx context.Context, p *Pod, removeCtrs, force bool,
switch p.runtime.config.Engine.CgroupManager {
case config.SystemdCgroupsManager:
if err := deleteSystemdCgroup(p.state.CgroupPath); err != nil {
if err := deleteSystemdCgroup(p.state.CgroupPath, p.ResourceLim()); err != nil {
if removalErr == nil {
removalErr = errors.Wrapf(err, "error removing pod %s cgroup", p.ID())
} else {

View file

@ -9,6 +9,8 @@ import (
"syscall"
"time"
runccgroup "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/containers/common/pkg/cgroups"
"github.com/containers/podman/v4/libpod/define"
"github.com/pkg/errors"
@ -69,29 +71,29 @@ func (c *Container) GetContainerStats(previousStats *define.ContainerStats) (*de
// If the current total usage in the cgroup is less than what was previously
// recorded then it means the container was restarted and runs in a new cgroup
if previousStats.Duration > cgroupStats.CPU.Usage.Total {
if previousStats.Duration > cgroupStats.CpuStats.CpuUsage.TotalUsage {
previousStats = &define.ContainerStats{}
}
previousCPU := previousStats.CPUNano
now := uint64(time.Now().UnixNano())
stats.Duration = cgroupStats.CPU.Usage.Total
stats.Duration = cgroupStats.CpuStats.CpuUsage.TotalUsage
stats.UpTime = time.Duration(stats.Duration)
stats.CPU = calculateCPUPercent(cgroupStats, previousCPU, now, previousStats.SystemNano)
// calc the average cpu usage for the time the container is running
stats.AvgCPU = calculateCPUPercent(cgroupStats, 0, now, uint64(c.state.StartedTime.UnixNano()))
stats.MemUsage = cgroupStats.Memory.Usage.Usage
stats.MemUsage = cgroupStats.MemoryStats.Usage.Usage
stats.MemLimit = c.getMemLimit()
stats.MemPerc = (float64(stats.MemUsage) / float64(stats.MemLimit)) * 100
stats.PIDs = 0
if conState == define.ContainerStateRunning || conState == define.ContainerStatePaused {
stats.PIDs = cgroupStats.Pids.Current
stats.PIDs = cgroupStats.PidsStats.Current
}
stats.BlockInput, stats.BlockOutput = calculateBlockIO(cgroupStats)
stats.CPUNano = cgroupStats.CPU.Usage.Total
stats.CPUSystemNano = cgroupStats.CPU.Usage.Kernel
stats.CPUNano = cgroupStats.CpuStats.CpuUsage.TotalUsage
stats.CPUSystemNano = cgroupStats.CpuStats.CpuUsage.UsageInKernelmode
stats.SystemNano = now
stats.PerCPU = cgroupStats.CPU.Usage.PerCPU
stats.PerCPU = cgroupStats.CpuStats.CpuUsage.PercpuUsage
// Handle case where the container is not in a network namespace
if netStats != nil {
stats.NetInput = netStats.TxBytes
@ -133,10 +135,10 @@ func (c *Container) getMemLimit() uint64 {
// previousCPU is the last value of stats.CPU.Usage.Total measured at the time previousSystem.
// (now - previousSystem) is the time delta in nanoseconds, between the measurement in previousCPU
// and the updated value in stats.
func calculateCPUPercent(stats *cgroups.Metrics, previousCPU, now, previousSystem uint64) float64 {
func calculateCPUPercent(stats *runccgroup.Stats, previousCPU, now, previousSystem uint64) float64 {
var (
cpuPercent = 0.0
cpuDelta = float64(stats.CPU.Usage.Total - previousCPU)
cpuDelta = float64(stats.CpuStats.CpuUsage.TotalUsage - previousCPU)
systemDelta = float64(now - previousSystem)
)
if systemDelta > 0.0 && cpuDelta > 0.0 {
@ -146,8 +148,8 @@ func calculateCPUPercent(stats *cgroups.Metrics, previousCPU, now, previousSyste
return cpuPercent
}
func calculateBlockIO(stats *cgroups.Metrics) (read uint64, write uint64) {
for _, blkIOEntry := range stats.Blkio.IoServiceBytesRecursive {
func calculateBlockIO(stats *runccgroup.Stats) (read uint64, write uint64) {
for _, blkIOEntry := range stats.BlkioStats.IoServiceBytesRecursive {
switch strings.ToLower(blkIOEntry.Op) {
case "read":
read += blkIOEntry.Value

View file

@ -11,6 +11,7 @@ import (
"github.com/containers/common/pkg/cgroups"
"github.com/containers/podman/v4/libpod/define"
"github.com/containers/podman/v4/pkg/rootless"
spec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
@ -20,7 +21,7 @@ import (
// systemdSliceFromPath makes a new systemd slice under the given parent with
// the given name.
// The parent must be a slice. The name must NOT include ".slice"
func systemdSliceFromPath(parent, name string) (string, error) {
func systemdSliceFromPath(parent, name string, resources *spec.LinuxResources) (string, error) {
cgroupPath, err := assembleSystemdCgroupName(parent, name)
if err != nil {
return "", err
@ -28,7 +29,7 @@ func systemdSliceFromPath(parent, name string) (string, error) {
logrus.Debugf("Created cgroup path %s for parent %s and name %s", cgroupPath, parent, name)
if err := makeSystemdCgroup(cgroupPath); err != nil {
if err := makeSystemdCgroup(cgroupPath, resources); err != nil {
return "", errors.Wrapf(err, "error creating cgroup %s", cgroupPath)
}
@ -45,8 +46,12 @@ func getDefaultSystemdCgroup() string {
}
// makeSystemdCgroup creates a systemd Cgroup at the given location.
func makeSystemdCgroup(path string) error {
controller, err := cgroups.NewSystemd(getDefaultSystemdCgroup())
func makeSystemdCgroup(path string, resources *spec.LinuxResources) error {
res, err := GetLimits(resources)
if err != nil {
return err
}
controller, err := cgroups.NewSystemd(getDefaultSystemdCgroup(), &res)
if err != nil {
return err
}
@ -54,12 +59,20 @@ func makeSystemdCgroup(path string) error {
if rootless.IsRootless() {
return controller.CreateSystemdUserUnit(path, rootless.GetRootlessUID())
}
return controller.CreateSystemdUnit(path)
err = controller.CreateSystemdUnit(path)
if err != nil {
return err
}
return nil
}
// deleteSystemdCgroup deletes the systemd cgroup at the given location
func deleteSystemdCgroup(path string) error {
controller, err := cgroups.NewSystemd(getDefaultSystemdCgroup())
func deleteSystemdCgroup(path string, resources *spec.LinuxResources) error {
res, err := GetLimits(resources)
if err != nil {
return err
}
controller, err := cgroups.NewSystemd(getDefaultSystemdCgroup(), &res)
if err != nil {
return err
}

View file

@ -12,6 +12,7 @@ import (
api "github.com/containers/podman/v4/pkg/api/types"
docker "github.com/docker/docker/api/types"
"github.com/gorilla/schema"
runccgroups "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
@ -133,7 +134,7 @@ streamLabel: // A label to flatten the scope
}
cfg := ctnr.Config()
memoryLimit := cgroupStat.Memory.Usage.Limit
memoryLimit := cgroupStat.MemoryStats.Usage.Limit
if cfg.Spec.Linux != nil && cfg.Spec.Linux.Resources != nil && cfg.Spec.Linux.Resources.Memory != nil && *cfg.Spec.Linux.Resources.Memory.Limit > 0 {
memoryLimit = uint64(*cfg.Spec.Linux.Resources.Memory.Limit)
}
@ -144,11 +145,11 @@ streamLabel: // A label to flatten the scope
Read: time.Now(),
PreRead: preRead,
PidsStats: docker.PidsStats{
Current: cgroupStat.Pids.Current,
Current: cgroupStat.PidsStats.Current,
Limit: 0,
},
BlkioStats: docker.BlkioStats{
IoServiceBytesRecursive: toBlkioStatEntry(cgroupStat.Blkio.IoServiceBytesRecursive),
IoServiceBytesRecursive: toBlkioStatEntry(cgroupStat.BlkioStats.IoServiceBytesRecursive),
IoServicedRecursive: nil,
IoQueuedRecursive: nil,
IoServiceTimeRecursive: nil,
@ -159,14 +160,14 @@ streamLabel: // A label to flatten the scope
},
CPUStats: CPUStats{
CPUUsage: docker.CPUUsage{
TotalUsage: cgroupStat.CPU.Usage.Total,
PercpuUsage: cgroupStat.CPU.Usage.PerCPU,
UsageInKernelmode: cgroupStat.CPU.Usage.Kernel,
UsageInUsermode: cgroupStat.CPU.Usage.Total - cgroupStat.CPU.Usage.Kernel,
TotalUsage: cgroupStat.CpuStats.CpuUsage.TotalUsage,
PercpuUsage: cgroupStat.CpuStats.CpuUsage.PercpuUsage,
UsageInKernelmode: cgroupStat.CpuStats.CpuUsage.UsageInKernelmode,
UsageInUsermode: cgroupStat.CpuStats.CpuUsage.TotalUsage - cgroupStat.CpuStats.CpuUsage.UsageInKernelmode,
},
CPU: stats.CPU,
SystemUsage: systemUsage,
OnlineCPUs: uint32(len(cgroupStat.CPU.Usage.PerCPU)),
OnlineCPUs: uint32(len(cgroupStat.CpuStats.CpuUsage.PercpuUsage)),
ThrottlingData: docker.ThrottlingData{
Periods: 0,
ThrottledPeriods: 0,
@ -175,8 +176,8 @@ streamLabel: // A label to flatten the scope
},
PreCPUStats: preCPUStats,
MemoryStats: docker.MemoryStats{
Usage: cgroupStat.Memory.Usage.Usage,
MaxUsage: cgroupStat.Memory.Usage.Limit,
Usage: cgroupStat.MemoryStats.Usage.Usage,
MaxUsage: cgroupStat.MemoryStats.Usage.Limit,
Stats: nil,
Failcnt: 0,
Limit: memoryLimit,
@ -216,7 +217,7 @@ streamLabel: // A label to flatten the scope
}
}
func toBlkioStatEntry(entries []cgroups.BlkIOEntry) []docker.BlkioStatEntry {
func toBlkioStatEntry(entries []runccgroups.BlkioStatEntry) []docker.BlkioStatEntry {
results := make([]docker.BlkioStatEntry, len(entries))
for i, e := range entries {
bits, err := json.Marshal(e)

View file

@ -180,10 +180,23 @@ func MakeContainer(ctx context.Context, rt *libpod.Runtime, s *specgen.SpecGener
if err != nil {
return nil, nil, nil, err
}
resources := runtimeSpec.Linux.Resources
// resources get overwrritten similarly to pod inheritance, manually assign here if there is a new value
marshalRes, err := json.Marshal(resources)
if err != nil {
return nil, nil, nil, err
}
err = json.Unmarshal(out, runtimeSpec.Linux)
if err != nil {
return nil, nil, nil, err
}
err = json.Unmarshal(marshalRes, runtimeSpec.Linux.Resources)
if err != nil {
return nil, nil, nil, err
}
}
if s.ResourceLimits != nil {
switch {

View file

@ -298,8 +298,7 @@ func SpecGenToOCI(ctx context.Context, s *specgen.SpecGenerator, rt *libpod.Runt
g.AddAnnotation(key, val)
}
switch {
case compatibleOptions.InfraResources == nil && s.ResourceLimits != nil:
if s.ResourceLimits != nil {
out, err := json.Marshal(s.ResourceLimits)
if err != nil {
return nil, err
@ -308,29 +307,9 @@ func SpecGenToOCI(ctx context.Context, s *specgen.SpecGenerator, rt *libpod.Runt
if err != nil {
return nil, err
}
case s.ResourceLimits != nil: // if we have predefined resource limits we need to make sure we keep the infra and container limits
originalResources, err := json.Marshal(s.ResourceLimits)
if err != nil {
return nil, err
}
infraResources, err := json.Marshal(compatibleOptions.InfraResources)
if err != nil {
return nil, err
}
err = json.Unmarshal(infraResources, s.ResourceLimits) // put infra's resource limits in the container
if err != nil {
return nil, err
}
err = json.Unmarshal(originalResources, s.ResourceLimits) // make sure we did not override anything
if err != nil {
return nil, err
}
g.Config.Linux.Resources = s.ResourceLimits
default:
g.Config.Linux.Resources = compatibleOptions.InfraResources
}
// Devices
// set the default rule at the beginning of device configuration
if !inUserNS && !s.Privileged {
g.AddLinuxResourcesDevice(false, "", nil, nil, "rwm")

View file

@ -472,4 +472,30 @@ spec:
run_podman pod rm $name-pod
}
@test "pod resource limits" {
skip_if_remote "resource limits only implemented on non-remote"
if is_rootless; then
skip "only meaningful for rootful"
fi
local name1="resources1"
run_podman --cgroup-manager=systemd pod create --name=$name1 --cpus=5
run_podman --cgroup-manager=systemd pod start $name1
run_podman pod inspect --format '{{.CgroupPath}}' $name1
local path1="$output"
local actual1=$(< /sys/fs/cgroup/$path1/cpu.max)
is "$actual1" "500000 100000" "resource limits set properly"
run_podman pod --cgroup-manager=systemd rm -f $name1
local name2="resources2"
run_podman --cgroup-manager=cgroupfs pod create --cpus=5 --name=$name2
run_podman --cgroup-manager=cgroupfs pod start $name2
run_podman pod inspect --format '{{.CgroupPath}}' $name2
local path2="$output"
local actual2=$(< /sys/fs/cgroup/$path2/cpu.max)
is "$actual2" "500000 100000" "resource limits set properly"
run_podman --cgroup-manager=cgroupfs pod rm $name2
}
# vim: filetype=sh

View file

@ -1,3 +1,6 @@
//go:build !linux
// +build !linux
package cgroups
import (

View file

@ -0,0 +1,161 @@
//go:build linux
// +build linux
package cgroups
import (
"bufio"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
)
type linuxBlkioHandler struct {
Blkio fs.BlkioGroup
}
func getBlkioHandler() *linuxBlkioHandler {
return &linuxBlkioHandler{}
}
// Apply set the specified constraints
func (c *linuxBlkioHandler) Apply(ctr *CgroupControl, res *configs.Resources) error {
if ctr.cgroup2 {
man, err := fs2.NewManager(ctr.config, filepath.Join(cgroupRoot, ctr.config.Path))
if err != nil {
return err
}
return man.Set(res)
}
path := filepath.Join(cgroupRoot, Blkio, ctr.config.Path)
return c.Blkio.Set(path, res)
}
// Create the cgroup
func (c *linuxBlkioHandler) Create(ctr *CgroupControl) (bool, error) {
if ctr.cgroup2 {
return false, nil
}
return ctr.createCgroupDirectory(Blkio)
}
// Destroy the cgroup
func (c *linuxBlkioHandler) Destroy(ctr *CgroupControl) error {
return rmDirRecursively(ctr.getCgroupv1Path(Blkio))
}
// Stat fills a metrics structure with usage stats for the controller
func (c *linuxBlkioHandler) Stat(ctr *CgroupControl, m *cgroups.Stats) error {
var ioServiceBytesRecursive []cgroups.BlkioStatEntry
if ctr.cgroup2 {
// more details on the io.stat file format:X https://facebookmicrosites.github.io/cgroup2/docs/io-controller.html
values, err := readCgroup2MapFile(ctr, "io.stat")
if err != nil {
return err
}
for k, v := range values {
d := strings.Split(k, ":")
if len(d) != 2 {
continue
}
minor, err := strconv.ParseUint(d[0], 10, 0)
if err != nil {
return err
}
major, err := strconv.ParseUint(d[1], 10, 0)
if err != nil {
return err
}
for _, item := range v {
d := strings.Split(item, "=")
if len(d) != 2 {
continue
}
op := d[0]
// Accommodate the cgroup v1 naming
switch op {
case "rbytes":
op = "read"
case "wbytes":
op = "write"
}
value, err := strconv.ParseUint(d[1], 10, 0)
if err != nil {
return err
}
entry := cgroups.BlkioStatEntry{
Op: op,
Major: major,
Minor: minor,
Value: value,
}
ioServiceBytesRecursive = append(ioServiceBytesRecursive, entry)
}
}
} else {
BlkioRoot := ctr.getCgroupv1Path(Blkio)
p := filepath.Join(BlkioRoot, "blkio.throttle.io_service_bytes_recursive")
f, err := os.Open(p)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return errors.Wrapf(err, "open %s", p)
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
line := scanner.Text()
parts := strings.Fields(line)
if len(parts) < 3 {
continue
}
d := strings.Split(parts[0], ":")
if len(d) != 2 {
continue
}
minor, err := strconv.ParseUint(d[0], 10, 0)
if err != nil {
return err
}
major, err := strconv.ParseUint(d[1], 10, 0)
if err != nil {
return err
}
op := parts[1]
value, err := strconv.ParseUint(parts[2], 10, 0)
if err != nil {
return err
}
entry := cgroups.BlkioStatEntry{
Op: op,
Major: major,
Minor: minor,
Value: value,
}
ioServiceBytesRecursive = append(ioServiceBytesRecursive, entry)
}
if err := scanner.Err(); err != nil {
return errors.Wrapf(err, "parse %s", p)
}
}
m.BlkioStats.IoServiceBytesRecursive = ioServiceBytesRecursive
return nil
}

View file

@ -1,8 +1,10 @@
//go:build !linux
// +build !linux
package cgroups
import (
"bufio"
"bytes"
"context"
"fmt"
"io/ioutil"
@ -248,47 +250,6 @@ func (c *CgroupControl) getCgroupv1Path(name string) string {
return filepath.Join(cgroupRoot, name, c.path)
}
// createCgroupv2Path creates the cgroupv2 path and enables all the available controllers
func createCgroupv2Path(path string) (deferredError error) {
if !strings.HasPrefix(path, cgroupRoot+"/") {
return fmt.Errorf("invalid cgroup path %s", path)
}
content, err := ioutil.ReadFile(cgroupRoot + "/cgroup.controllers")
if err != nil {
return err
}
ctrs := bytes.Fields(content)
res := append([]byte("+"), bytes.Join(ctrs, []byte(" +"))...)
current := "/sys/fs"
elements := strings.Split(path, "/")
for i, e := range elements[3:] {
current = filepath.Join(current, e)
if i > 0 {
if err := os.Mkdir(current, 0o755); err != nil {
if !os.IsExist(err) {
return err
}
} else {
// If the directory was created, be sure it is not left around on errors.
defer func() {
if deferredError != nil {
os.Remove(current)
}
}()
}
}
// We enable the controllers for all the path components except the last one. It is not allowed to add
// PIDs if there are already enabled controllers.
if i < len(elements[3:])-1 {
if err := ioutil.WriteFile(filepath.Join(current, "cgroup.subtree_control"), res, 0o755); err != nil {
return err
}
}
}
return nil
}
// initialize initializes the specified hierarchy
func (c *CgroupControl) initialize() (err error) {
createdSoFar := map[string]controllerHandler{}
@ -332,23 +293,6 @@ func (c *CgroupControl) initialize() (err error) {
return nil
}
func (c *CgroupControl) createCgroupDirectory(controller string) (bool, error) {
cPath := c.getCgroupv1Path(controller)
_, err := os.Stat(cPath)
if err == nil {
return false, nil
}
if !os.IsNotExist(err) {
return false, err
}
if err := os.MkdirAll(cPath, 0o755); err != nil {
return false, errors.Wrapf(err, "error creating cgroup for %s", controller)
}
return true, nil
}
func readFileAsUint64(path string) (uint64, error) {
data, err := ioutil.ReadFile(path)
if err != nil {

View file

@ -0,0 +1,575 @@
//go:build linux
// +build linux
package cgroups
import (
"bufio"
"context"
"fmt"
"io/ioutil"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/containers/storage/pkg/unshare"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/godbus/dbus/v5"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
var (
// ErrCgroupDeleted means the cgroup was deleted
ErrCgroupDeleted = errors.New("cgroup deleted")
// ErrCgroupV1Rootless means the cgroup v1 were attempted to be used in rootless environment
ErrCgroupV1Rootless = errors.New("no support for CGroups V1 in rootless environments")
ErrStatCgroup = errors.New("no cgroup available for gathering user statistics")
)
// CgroupControl controls a cgroup hierarchy
type CgroupControl struct {
cgroup2 bool
config *configs.Cgroup
systemd bool
// List of additional cgroup subsystems joined that
// do not have a custom handler.
additionalControllers []controller
}
type controller struct {
name string
symlink bool
}
type controllerHandler interface {
Create(*CgroupControl) (bool, error)
Apply(*CgroupControl, *configs.Resources) error
Destroy(*CgroupControl) error
Stat(*CgroupControl, *cgroups.Stats) error
}
const (
cgroupRoot = "/sys/fs/cgroup"
// CPU is the cpu controller
CPU = "cpu"
// CPUAcct is the cpuacct controller
CPUAcct = "cpuacct"
// CPUset is the cpuset controller
CPUset = "cpuset"
// Memory is the memory controller
Memory = "memory"
// Pids is the pids controller
Pids = "pids"
// Blkio is the blkio controller
Blkio = "blkio"
)
var handlers map[string]controllerHandler
func init() {
handlers = make(map[string]controllerHandler)
handlers[CPU] = getCPUHandler()
handlers[CPUset] = getCpusetHandler()
handlers[Memory] = getMemoryHandler()
handlers[Pids] = getPidsHandler()
handlers[Blkio] = getBlkioHandler()
}
// getAvailableControllers get the available controllers
func getAvailableControllers(exclude map[string]controllerHandler, cgroup2 bool) ([]controller, error) {
if cgroup2 {
controllers := []controller{}
controllersFile := cgroupRoot + "/cgroup.controllers"
// rootless cgroupv2: check available controllers for current user, systemd or servicescope will inherit
if unshare.IsRootless() {
userSlice, err := getCgroupPathForCurrentProcess()
if err != nil {
return controllers, err
}
// userSlice already contains '/' so not adding here
basePath := cgroupRoot + userSlice
controllersFile = fmt.Sprintf("%s/cgroup.controllers", basePath)
}
controllersFileBytes, err := ioutil.ReadFile(controllersFile)
if err != nil {
return nil, errors.Wrapf(err, "failed while reading controllers for cgroup v2 from %q", controllersFile)
}
for _, controllerName := range strings.Fields(string(controllersFileBytes)) {
c := controller{
name: controllerName,
symlink: false,
}
controllers = append(controllers, c)
}
return controllers, nil
}
subsystems, _ := cgroupV1GetAllSubsystems()
controllers := []controller{}
// cgroupv1 and rootless: No subsystem is available: delegation is unsafe.
if unshare.IsRootless() {
return controllers, nil
}
for _, name := range subsystems {
if _, found := exclude[name]; found {
continue
}
fileInfo, err := os.Stat(cgroupRoot + "/" + name)
if err != nil {
continue
}
c := controller{
name: name,
symlink: !fileInfo.IsDir(),
}
controllers = append(controllers, c)
}
return controllers, nil
}
// GetAvailableControllers get string:bool map of all the available controllers
func GetAvailableControllers(exclude map[string]controllerHandler, cgroup2 bool) ([]string, error) {
availableControllers, err := getAvailableControllers(exclude, cgroup2)
if err != nil {
return nil, err
}
controllerList := []string{}
for _, controller := range availableControllers {
controllerList = append(controllerList, controller.name)
}
return controllerList, nil
}
func cgroupV1GetAllSubsystems() ([]string, error) {
f, err := os.Open("/proc/cgroups")
if err != nil {
return nil, err
}
defer f.Close()
subsystems := []string{}
s := bufio.NewScanner(f)
for s.Scan() {
text := s.Text()
if text[0] != '#' {
parts := strings.Fields(text)
if len(parts) >= 4 && parts[3] != "0" {
subsystems = append(subsystems, parts[0])
}
}
}
if err := s.Err(); err != nil {
return nil, err
}
return subsystems, nil
}
func getCgroupPathForCurrentProcess() (string, error) {
path := fmt.Sprintf("/proc/%d/cgroup", os.Getpid())
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
cgroupPath := ""
s := bufio.NewScanner(f)
for s.Scan() {
text := s.Text()
procEntries := strings.SplitN(text, "::", 2)
// set process cgroupPath only if entry is valid
if len(procEntries) > 1 {
cgroupPath = procEntries[1]
}
}
if err := s.Err(); err != nil {
return cgroupPath, err
}
return cgroupPath, nil
}
// getCgroupv1Path is a helper function to get the cgroup v1 path
func (c *CgroupControl) getCgroupv1Path(name string) string {
return filepath.Join(cgroupRoot, name, c.config.Path)
}
// initialize initializes the specified hierarchy
func (c *CgroupControl) initialize() (err error) {
createdSoFar := map[string]controllerHandler{}
defer func() {
if err != nil {
for name, ctr := range createdSoFar {
if err := ctr.Destroy(c); err != nil {
logrus.Warningf("error cleaning up controller %s for %s", name, c.config.Path)
}
}
}
}()
if c.cgroup2 {
if err := createCgroupv2Path(filepath.Join(cgroupRoot, c.config.Path)); err != nil {
return errors.Wrapf(err, "error creating cgroup path %s", c.config.Path)
}
}
for name, handler := range handlers {
created, err := handler.Create(c)
if err != nil {
return err
}
if created {
createdSoFar[name] = handler
}
}
if !c.cgroup2 {
// We won't need to do this for cgroup v2
for _, ctr := range c.additionalControllers {
if ctr.symlink {
continue
}
path := c.getCgroupv1Path(ctr.name)
if err := os.MkdirAll(path, 0o755); err != nil {
return errors.Wrapf(err, "error creating cgroup path for %s", ctr.name)
}
}
}
return nil
}
func readFileAsUint64(path string) (uint64, error) {
data, err := ioutil.ReadFile(path)
if err != nil {
return 0, err
}
v := cleanString(string(data))
if v == "max" {
return math.MaxUint64, nil
}
ret, err := strconv.ParseUint(v, 10, 64)
if err != nil {
return ret, errors.Wrapf(err, "parse %s from %s", v, path)
}
return ret, nil
}
func readFileByKeyAsUint64(path, key string) (uint64, error) {
content, err := ioutil.ReadFile(path)
if err != nil {
return 0, err
}
for _, line := range strings.Split(string(content), "\n") {
fields := strings.SplitN(line, " ", 2)
if fields[0] == key {
v := cleanString(string(fields[1]))
if v == "max" {
return math.MaxUint64, nil
}
ret, err := strconv.ParseUint(v, 10, 64)
if err != nil {
return ret, errors.Wrapf(err, "parse %s from %s", v, path)
}
return ret, nil
}
}
return 0, fmt.Errorf("no key named %s from %s", key, path)
}
// New creates a new cgroup control
func New(path string, resources *configs.Resources) (*CgroupControl, error) {
cgroup2, err := IsCgroup2UnifiedMode()
if err != nil {
return nil, err
}
control := &CgroupControl{
cgroup2: cgroup2,
config: &configs.Cgroup{
Path: path,
Resources: resources,
},
}
if !cgroup2 {
controllers, err := getAvailableControllers(handlers, false)
if err != nil {
return nil, err
}
control.additionalControllers = controllers
}
if err := control.initialize(); err != nil {
return nil, err
}
return control, nil
}
// NewSystemd creates a new cgroup control
func NewSystemd(path string, resources *configs.Resources) (*CgroupControl, error) {
cgroup2, err := IsCgroup2UnifiedMode()
if err != nil {
return nil, err
}
control := &CgroupControl{
cgroup2: cgroup2,
systemd: true,
config: &configs.Cgroup{
Path: path,
Resources: resources,
Rootless: unshare.IsRootless(),
},
}
return control, nil
}
// Load loads an existing cgroup control
func Load(path string) (*CgroupControl, error) {
cgroup2, err := IsCgroup2UnifiedMode()
if err != nil {
return nil, err
}
control := &CgroupControl{
cgroup2: cgroup2,
systemd: false,
config: &configs.Cgroup{
Path: path,
},
}
if !cgroup2 {
controllers, err := getAvailableControllers(handlers, false)
if err != nil {
return nil, err
}
control.additionalControllers = controllers
}
if !cgroup2 {
oneExists := false
// check that the cgroup exists at least under one controller
for name := range handlers {
p := control.getCgroupv1Path(name)
if _, err := os.Stat(p); err == nil {
oneExists = true
break
}
}
// if there is no controller at all, raise an error
if !oneExists {
if unshare.IsRootless() {
return nil, ErrCgroupV1Rootless
}
// compatible with the error code
// used by containerd/cgroups
return nil, ErrCgroupDeleted
}
}
return control, nil
}
// CreateSystemdUnit creates the systemd cgroup
func (c *CgroupControl) CreateSystemdUnit(path string) error {
if !c.systemd {
return fmt.Errorf("the cgroup controller is not using systemd")
}
conn, err := systemdDbus.NewWithContext(context.TODO())
if err != nil {
return err
}
defer conn.Close()
return systemdCreate(c.config.Resources, path, conn)
}
// GetUserConnection returns an user connection to D-BUS
func GetUserConnection(uid int) (*systemdDbus.Conn, error) {
return systemdDbus.NewConnection(func() (*dbus.Conn, error) {
return dbusAuthConnection(uid, dbus.SessionBusPrivate)
})
}
// CreateSystemdUserUnit creates the systemd cgroup for the specified user
func (c *CgroupControl) CreateSystemdUserUnit(path string, uid int) error {
if !c.systemd {
return fmt.Errorf("the cgroup controller is not using systemd")
}
conn, err := GetUserConnection(uid)
if err != nil {
return err
}
defer conn.Close()
return systemdCreate(c.config.Resources, path, conn)
}
func dbusAuthConnection(uid int, createBus func(opts ...dbus.ConnOption) (*dbus.Conn, error)) (*dbus.Conn, error) {
conn, err := createBus()
if err != nil {
return nil, err
}
methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(uid))}
err = conn.Auth(methods)
if err != nil {
conn.Close()
return nil, err
}
if err := conn.Hello(); err != nil {
return nil, err
}
return conn, nil
}
// Delete cleans a cgroup
func (c *CgroupControl) Delete() error {
return c.DeleteByPath(c.config.Path)
}
// DeleteByPathConn deletes the specified cgroup path using the specified
// dbus connection if needed.
func (c *CgroupControl) DeleteByPathConn(path string, conn *systemdDbus.Conn) error {
if c.systemd {
return systemdDestroyConn(path, conn)
}
if c.cgroup2 {
return rmDirRecursively(filepath.Join(cgroupRoot, c.config.Path))
}
var lastError error
for _, h := range handlers {
if err := h.Destroy(c); err != nil {
lastError = err
}
}
for _, ctr := range c.additionalControllers {
if ctr.symlink {
continue
}
p := c.getCgroupv1Path(ctr.name)
if err := rmDirRecursively(p); err != nil {
lastError = errors.Wrapf(err, "remove %s", p)
}
}
return lastError
}
// DeleteByPath deletes the specified cgroup path
func (c *CgroupControl) DeleteByPath(path string) error {
if c.systemd {
conn, err := systemdDbus.NewWithContext(context.TODO())
if err != nil {
return err
}
defer conn.Close()
return c.DeleteByPathConn(path, conn)
}
return c.DeleteByPathConn(path, nil)
}
// Update updates the cgroups
func (c *CgroupControl) Update(resources *configs.Resources) error {
for _, h := range handlers {
if err := h.Apply(c, resources); err != nil {
return err
}
}
return nil
}
// AddPid moves the specified pid to the cgroup
func (c *CgroupControl) AddPid(pid int) error {
pidString := []byte(fmt.Sprintf("%d\n", pid))
if c.cgroup2 {
path := filepath.Join(cgroupRoot, c.config.Path)
return fs2.CreateCgroupPath(path, c.config)
}
names := make([]string, 0, len(handlers))
for n := range handlers {
names = append(names, n)
}
for _, c := range c.additionalControllers {
if !c.symlink {
names = append(names, c.name)
}
}
for _, n := range names {
// If we aren't using cgroup2, we won't write correctly to unified hierarchy
if !c.cgroup2 && n == "unified" {
continue
}
p := filepath.Join(c.getCgroupv1Path(n), "tasks")
if err := ioutil.WriteFile(p, pidString, 0o644); err != nil {
return errors.Wrapf(err, "write %s", p)
}
}
return nil
}
// Stat returns usage statistics for the cgroup
func (c *CgroupControl) Stat() (*cgroups.Stats, error) {
m := cgroups.Stats{}
found := false
for _, h := range handlers {
if err := h.Stat(c, &m); err != nil {
if !os.IsNotExist(errors.Cause(err)) {
return nil, err
}
logrus.Warningf("Failed to retrieve cgroup stats: %v", err)
continue
}
found = true
}
if !found {
return nil, ErrStatCgroup
}
return &m, nil
}
func readCgroup2MapPath(path string) (map[string][]string, error) {
ret := map[string][]string{}
f, err := os.Open(path)
if err != nil {
if os.IsNotExist(err) {
return ret, nil
}
return nil, errors.Wrapf(err, "open file %s", path)
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
line := scanner.Text()
parts := strings.Fields(line)
if len(parts) < 2 {
continue
}
ret[parts[0]] = parts[1:]
}
if err := scanner.Err(); err != nil {
return nil, errors.Wrapf(err, "parsing file %s", path)
}
return ret, nil
}
func readCgroup2MapFile(ctr *CgroupControl, name string) (map[string][]string, error) {
p := filepath.Join(cgroupRoot, ctr.config.Path, name)
return readCgroup2MapPath(p)
}

View file

@ -1,12 +1,12 @@
//go:build !linux
// +build !linux
package cgroups
import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
spec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
@ -18,36 +18,6 @@ func getCPUHandler() *cpuHandler {
return &cpuHandler{}
}
func cleanString(s string) string {
return strings.Trim(s, "\n")
}
func readAcct(ctr *CgroupControl, name string) (uint64, error) {
p := filepath.Join(ctr.getCgroupv1Path(CPUAcct), name)
return readFileAsUint64(p)
}
func readAcctList(ctr *CgroupControl, name string) ([]uint64, error) {
p := filepath.Join(ctr.getCgroupv1Path(CPUAcct), name)
data, err := ioutil.ReadFile(p)
if err != nil {
return nil, errors.Wrapf(err, "reading %s", p)
}
r := []uint64{}
for _, s := range strings.Split(string(data), " ") {
s = cleanString(s)
if s == "" {
break
}
v, err := strconv.ParseUint(s, 10, 64)
if err != nil {
return nil, errors.Wrapf(err, "parsing %s", s)
}
r = append(r, v)
}
return r, nil
}
// Apply set the specified constraints
func (c *cpuHandler) Apply(ctr *CgroupControl, res *spec.LinuxResources) error {
if res.CPU == nil {
@ -119,41 +89,3 @@ func (c *cpuHandler) Stat(ctr *CgroupControl, m *Metrics) error {
m.CPU = CPUMetrics{Usage: usage}
return nil
}
// GetSystemCPUUsage returns the system usage for all the cgroups
func GetSystemCPUUsage() (uint64, error) {
cgroupv2, err := IsCgroup2UnifiedMode()
if err != nil {
return 0, err
}
if !cgroupv2 {
p := filepath.Join(cgroupRoot, CPUAcct, "cpuacct.usage")
return readFileAsUint64(p)
}
files, err := ioutil.ReadDir(cgroupRoot)
if err != nil {
return 0, err
}
var total uint64
for _, file := range files {
if !file.IsDir() {
continue
}
p := filepath.Join(cgroupRoot, file.Name(), "cpu.stat")
values, err := readCgroup2MapPath(p)
if err != nil {
return 0, err
}
if val, found := values["usage_usec"]; found {
v, err := strconv.ParseUint(cleanString(val[0]), 10, 64)
if err != nil {
return 0, err
}
total += v * 1000
}
}
return total, nil
}

View file

@ -0,0 +1,100 @@
//go:build linux
// +build linux
package cgroups
import (
"os"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
)
type linuxCPUHandler struct {
CPU fs.CpuGroup
}
func getCPUHandler() *linuxCPUHandler {
return &linuxCPUHandler{}
}
// Apply set the specified constraints
func (c *linuxCPUHandler) Apply(ctr *CgroupControl, res *configs.Resources) error {
if ctr.cgroup2 {
man, err := fs2.NewManager(ctr.config, filepath.Join(cgroupRoot, ctr.config.Path))
if err != nil {
return err
}
return man.Set(res)
}
path := filepath.Join(cgroupRoot, CPU, ctr.config.Path)
return c.CPU.Set(path, res)
}
// Create the cgroup
func (c *linuxCPUHandler) Create(ctr *CgroupControl) (bool, error) {
if ctr.cgroup2 {
return false, nil
}
return ctr.createCgroupDirectory(CPU)
}
// Destroy the cgroup
func (c *linuxCPUHandler) Destroy(ctr *CgroupControl) error {
return rmDirRecursively(ctr.getCgroupv1Path(CPU))
}
// Stat fills a metrics structure with usage stats for the controller
func (c *linuxCPUHandler) Stat(ctr *CgroupControl, m *cgroups.Stats) error {
var err error
cpu := cgroups.CpuStats{}
if ctr.cgroup2 {
values, err := readCgroup2MapFile(ctr, "cpu.stat")
if err != nil {
return err
}
if val, found := values["usage_usec"]; found {
cpu.CpuUsage.TotalUsage, err = strconv.ParseUint(cleanString(val[0]), 10, 64)
if err != nil {
return err
}
cpu.CpuUsage.UsageInKernelmode *= 1000
}
if val, found := values["system_usec"]; found {
cpu.CpuUsage.UsageInKernelmode, err = strconv.ParseUint(cleanString(val[0]), 10, 64)
if err != nil {
return err
}
cpu.CpuUsage.TotalUsage *= 1000
}
} else {
cpu.CpuUsage.TotalUsage, err = readAcct(ctr, "cpuacct.usage")
if err != nil {
if !os.IsNotExist(errors.Cause(err)) {
return err
}
cpu.CpuUsage.TotalUsage = 0
}
cpu.CpuUsage.UsageInKernelmode, err = readAcct(ctr, "cpuacct.usage_sys")
if err != nil {
if !os.IsNotExist(errors.Cause(err)) {
return err
}
cpu.CpuUsage.UsageInKernelmode = 0
}
cpu.CpuUsage.PercpuUsage, err = readAcctList(ctr, "cpuacct.usage_percpu")
if err != nil {
if !os.IsNotExist(errors.Cause(err)) {
return err
}
cpu.CpuUsage.PercpuUsage = nil
}
}
m.CpuStats = cpu
return nil
}

View file

@ -1,52 +1,17 @@
//go:build !linux
// +build !linux
package cgroups
import (
"fmt"
"io/ioutil"
"path/filepath"
"strings"
spec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
)
type cpusetHandler struct{}
func cpusetCopyFileFromParent(dir, file string, cgroupv2 bool) ([]byte, error) {
if dir == cgroupRoot {
return nil, fmt.Errorf("could not find parent to initialize cpuset %s", file)
}
path := filepath.Join(dir, file)
parentPath := path
if cgroupv2 {
parentPath = fmt.Sprintf("%s.effective", parentPath)
}
data, err := ioutil.ReadFile(parentPath)
if err != nil {
return nil, errors.Wrapf(err, "open %s", path)
}
if strings.Trim(string(data), "\n") != "" {
return data, nil
}
data, err = cpusetCopyFileFromParent(filepath.Dir(dir), file, cgroupv2)
if err != nil {
return nil, err
}
if err := ioutil.WriteFile(path, data, 0o644); err != nil {
return nil, errors.Wrapf(err, "write %s", path)
}
return data, nil
}
func cpusetCopyFromParent(path string, cgroupv2 bool) error {
for _, file := range []string{"cpuset.cpus", "cpuset.mems"} {
if _, err := cpusetCopyFileFromParent(path, file, cgroupv2); err != nil {
return err
}
}
return nil
}
func getCpusetHandler() *cpusetHandler {
return &cpusetHandler{}
}

View file

@ -0,0 +1,57 @@
//go:build linux
// +build linux
package cgroups
import (
"path/filepath"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
)
type linuxCpusetHandler struct {
CPUSet fs.CpusetGroup
}
func getCpusetHandler() *linuxCpusetHandler {
return &linuxCpusetHandler{}
}
// Apply set the specified constraints
func (c *linuxCpusetHandler) Apply(ctr *CgroupControl, res *configs.Resources) error {
if ctr.cgroup2 {
man, err := fs2.NewManager(ctr.config, filepath.Join(cgroupRoot, ctr.config.Path))
if err != nil {
return err
}
return man.Set(res)
}
path := filepath.Join(cgroupRoot, CPUset, ctr.config.Path)
return c.CPUSet.Set(path, res)
}
// Create the cgroup
func (c *linuxCpusetHandler) Create(ctr *CgroupControl) (bool, error) {
if ctr.cgroup2 {
path := filepath.Join(cgroupRoot, ctr.config.Path)
return true, cpusetCopyFromParent(path, true)
}
created, err := ctr.createCgroupDirectory(CPUset)
if !created || err != nil {
return created, err
}
return true, cpusetCopyFromParent(ctr.getCgroupv1Path(CPUset), false)
}
// Destroy the cgroup
func (c *linuxCpusetHandler) Destroy(ctr *CgroupControl) error {
return rmDirRecursively(ctr.getCgroupv1Path(CPUset))
}
// Stat fills a metrics structure with usage stats for the controller
func (c *linuxCpusetHandler) Stat(ctr *CgroupControl, m *cgroups.Stats) error {
return nil
}

View file

@ -1,3 +1,6 @@
//go:build !linux
// +build !linux
package cgroups
import (

View file

@ -0,0 +1,78 @@
//go:build linux
// +build linux
package cgroups
import (
"path/filepath"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
)
type linuxMemHandler struct {
Mem fs.MemoryGroup
}
func getMemoryHandler() *linuxMemHandler {
return &linuxMemHandler{}
}
// Apply set the specified constraints
func (c *linuxMemHandler) Apply(ctr *CgroupControl, res *configs.Resources) error {
if ctr.cgroup2 {
man, err := fs2.NewManager(ctr.config, filepath.Join(cgroupRoot, ctr.config.Path))
if err != nil {
return err
}
return man.Set(res)
}
path := filepath.Join(cgroupRoot, Memory, ctr.config.Path)
return c.Mem.Set(path, res)
}
// Create the cgroup
func (c *linuxMemHandler) Create(ctr *CgroupControl) (bool, error) {
if ctr.cgroup2 {
return false, nil
}
return ctr.createCgroupDirectory(Memory)
}
// Destroy the cgroup
func (c *linuxMemHandler) Destroy(ctr *CgroupControl) error {
return rmDirRecursively(ctr.getCgroupv1Path(Memory))
}
// Stat fills a metrics structure with usage stats for the controller
func (c *linuxMemHandler) Stat(ctr *CgroupControl, m *cgroups.Stats) error {
var err error
memUsage := cgroups.MemoryStats{}
var memoryRoot string
var limitFilename string
if ctr.cgroup2 {
memoryRoot = filepath.Join(cgroupRoot, ctr.config.Path)
limitFilename = "memory.max"
if memUsage.Usage.Usage, err = readFileByKeyAsUint64(filepath.Join(memoryRoot, "memory.stat"), "anon"); err != nil {
return err
}
} else {
memoryRoot = ctr.getCgroupv1Path(Memory)
limitFilename = "memory.limit_in_bytes"
if memUsage.Usage.Usage, err = readFileAsUint64(filepath.Join(memoryRoot, "memory.usage_in_bytes")); err != nil {
return err
}
}
memUsage.Usage.Limit, err = readFileAsUint64(filepath.Join(memoryRoot, limitFilename))
if err != nil {
return err
}
m.MemoryStats = memUsage
return nil
}

View file

@ -1,3 +1,6 @@
//go:build !linux
// +build !linux
package cgroups
import (

View file

@ -0,0 +1,71 @@
//go:build linux
// +build linux
package cgroups
import (
"path/filepath"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
)
type linuxPidHandler struct {
Pid fs.PidsGroup
}
func getPidsHandler() *linuxPidHandler {
return &linuxPidHandler{}
}
// Apply set the specified constraints
func (c *linuxPidHandler) Apply(ctr *CgroupControl, res *configs.Resources) error {
if ctr.cgroup2 {
man, err := fs2.NewManager(ctr.config, filepath.Join(cgroupRoot, ctr.config.Path))
if err != nil {
return err
}
return man.Set(res)
}
path := filepath.Join(cgroupRoot, Pids, ctr.config.Path)
return c.Pid.Set(path, res)
}
// Create the cgroup
func (c *linuxPidHandler) Create(ctr *CgroupControl) (bool, error) {
if ctr.cgroup2 {
return false, nil
}
return ctr.createCgroupDirectory(Pids)
}
// Destroy the cgroup
func (c *linuxPidHandler) Destroy(ctr *CgroupControl) error {
return rmDirRecursively(ctr.getCgroupv1Path(Pids))
}
// Stat fills a metrics structure with usage stats for the controller
func (c *linuxPidHandler) Stat(ctr *CgroupControl, m *cgroups.Stats) error {
if ctr.config.Path == "" {
// nothing we can do to retrieve the pids.current path
return nil
}
var PIDRoot string
if ctr.cgroup2 {
PIDRoot = filepath.Join(cgroupRoot, ctr.config.Path)
} else {
PIDRoot = ctr.getCgroupv1Path(Pids)
}
current, err := readFileAsUint64(filepath.Join(PIDRoot, "pids.current"))
if err != nil {
return err
}
m.PidsStats.Current = current
return nil
}

View file

@ -1,3 +1,6 @@
//go:build !linux
// +build !linux
package cgroups
import (
@ -52,15 +55,11 @@ func systemdCreate(path string, c *systemdDbus.Conn) error {
/*
systemdDestroyConn is copied from containerd/cgroups/systemd.go file, that
has the following license:
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

View file

@ -0,0 +1,167 @@
//go:build linux
// +build linux
package cgroups
import (
"context"
"fmt"
"path/filepath"
"strings"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/godbus/dbus/v5"
"github.com/opencontainers/runc/libcontainer/configs"
)
func systemdCreate(resources *configs.Resources, path string, c *systemdDbus.Conn) error {
slice, name := filepath.Split(path)
slice = strings.TrimSuffix(slice, "/")
var lastError error
for i := 0; i < 2; i++ {
properties := []systemdDbus.Property{
systemdDbus.PropDescription(fmt.Sprintf("cgroup %s", name)),
systemdDbus.PropWants(slice),
}
pMap := map[string]bool{
"DefaultDependencies": false,
"MemoryAccounting": true,
"CPUAccounting": true,
"BlockIOAccounting": true,
}
if i == 0 {
pMap["Delegate"] = true
}
for k, v := range pMap {
p := systemdDbus.Property{
Name: k,
Value: dbus.MakeVariant(v),
}
properties = append(properties, p)
}
uMap, sMap, bMap, iMap := resourcesToProps(resources)
for k, v := range uMap {
p := systemdDbus.Property{
Name: k,
Value: dbus.MakeVariant(v),
}
properties = append(properties, p)
}
for k, v := range sMap {
p := systemdDbus.Property{
Name: k,
Value: dbus.MakeVariant(v),
}
properties = append(properties, p)
}
for k, v := range bMap {
p := systemdDbus.Property{
Name: k,
Value: dbus.MakeVariant(v),
}
properties = append(properties, p)
}
for k, v := range iMap {
p := systemdDbus.Property{
Name: k,
Value: dbus.MakeVariant(v),
}
properties = append(properties, p)
}
ch := make(chan string)
_, err := c.StartTransientUnitContext(context.TODO(), name, "replace", properties, ch)
if err != nil {
lastError = err
continue
}
<-ch
return nil
}
return lastError
}
/*
systemdDestroyConn is copied from containerd/cgroups/systemd.go file, that
has the following license:
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
func systemdDestroyConn(path string, c *systemdDbus.Conn) error {
name := filepath.Base(path)
ch := make(chan string)
_, err := c.StopUnitContext(context.TODO(), name, "replace", ch)
if err != nil {
return err
}
<-ch
return nil
}
func resourcesToProps(res *configs.Resources) (map[string]uint64, map[string]string, map[string][]byte, map[string]int64) {
bMap := make(map[string][]byte)
// this array is not used but will be once more resource limits are added
sMap := make(map[string]string)
iMap := make(map[string]int64)
uMap := make(map[string]uint64)
// CPU
if res.CpuPeriod != 0 {
uMap["CPUQuotaPeriodUSec"] = res.CpuPeriod
}
if res.CpuQuota != 0 {
period := res.CpuPeriod
if period == 0 {
period = uint64(100000)
}
cpuQuotaPerSecUSec := uint64(res.CpuQuota*1000000) / period
if cpuQuotaPerSecUSec%10000 != 0 {
cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
}
uMap["CPUQuotaPerSecUSec"] = cpuQuotaPerSecUSec
}
// CPUSet
if res.CpusetCpus != "" {
bits := []byte(res.CpusetCpus)
bMap["AllowedCPUs"] = bits
}
if res.CpusetMems != "" {
bits := []byte(res.CpusetMems)
bMap["AllowedMemoryNodes"] = bits
}
// Mem
if res.Memory != 0 {
iMap["MemoryMax"] = res.Memory
}
if res.MemorySwap != 0 {
iMap["MemorySwapMax"] = res.MemorySwap
}
// Blkio
if res.BlkioWeight > 0 {
uMap["BlockIOWeight"] = uint64(res.BlkioWeight)
}
return uMap, sMap, bMap, iMap
}

View file

@ -0,0 +1,176 @@
package cgroups
import (
"bytes"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/pkg/errors"
)
var TestMode bool
func cleanString(s string) string {
return strings.Trim(s, "\n")
}
func readAcct(ctr *CgroupControl, name string) (uint64, error) {
p := filepath.Join(ctr.getCgroupv1Path(CPUAcct), name)
return readFileAsUint64(p)
}
func readAcctList(ctr *CgroupControl, name string) ([]uint64, error) {
p := filepath.Join(ctr.getCgroupv1Path(CPUAcct), name)
data, err := ioutil.ReadFile(p)
if err != nil {
return nil, errors.Wrapf(err, "reading %s", p)
}
r := []uint64{}
for _, s := range strings.Split(string(data), " ") {
s = cleanString(s)
if s == "" {
break
}
v, err := strconv.ParseUint(s, 10, 64)
if err != nil {
return nil, errors.Wrapf(err, "parsing %s", s)
}
r = append(r, v)
}
return r, nil
}
// GetSystemCPUUsage returns the system usage for all the cgroups
func GetSystemCPUUsage() (uint64, error) {
cgroupv2, err := IsCgroup2UnifiedMode()
if err != nil {
return 0, err
}
if !cgroupv2 {
p := filepath.Join(cgroupRoot, CPUAcct, "cpuacct.usage")
return readFileAsUint64(p)
}
files, err := ioutil.ReadDir(cgroupRoot)
if err != nil {
return 0, err
}
var total uint64
for _, file := range files {
if !file.IsDir() {
continue
}
p := filepath.Join(cgroupRoot, file.Name(), "cpu.stat")
values, err := readCgroup2MapPath(p)
if err != nil {
return 0, err
}
if val, found := values["usage_usec"]; found {
v, err := strconv.ParseUint(cleanString(val[0]), 10, 64)
if err != nil {
return 0, err
}
total += v * 1000
}
}
return total, nil
}
func cpusetCopyFileFromParent(dir, file string, cgroupv2 bool) ([]byte, error) {
if dir == cgroupRoot {
return nil, fmt.Errorf("could not find parent to initialize cpuset %s", file)
}
path := filepath.Join(dir, file)
parentPath := path
if cgroupv2 {
parentPath = fmt.Sprintf("%s.effective", parentPath)
}
data, err := ioutil.ReadFile(parentPath)
if err != nil {
return nil, errors.Wrapf(err, "open %s", path)
}
if strings.Trim(string(data), "\n") != "" {
return data, nil
}
data, err = cpusetCopyFileFromParent(filepath.Dir(dir), file, cgroupv2)
if err != nil {
return nil, err
}
if err := ioutil.WriteFile(path, data, 0o644); err != nil {
return nil, errors.Wrapf(err, "write %s", path)
}
return data, nil
}
func cpusetCopyFromParent(path string, cgroupv2 bool) error {
for _, file := range []string{"cpuset.cpus", "cpuset.mems"} {
if _, err := cpusetCopyFileFromParent(path, file, cgroupv2); err != nil {
return err
}
}
return nil
}
// createCgroupv2Path creates the cgroupv2 path and enables all the available controllers
func createCgroupv2Path(path string) (deferredError error) {
if !strings.HasPrefix(path, cgroupRoot+"/") {
return fmt.Errorf("invalid cgroup path %s", path)
}
content, err := ioutil.ReadFile(cgroupRoot + "/cgroup.controllers")
if err != nil {
return err
}
ctrs := bytes.Fields(content)
res := append([]byte("+"), bytes.Join(ctrs, []byte(" +"))...)
current := "/sys/fs"
elements := strings.Split(path, "/")
for i, e := range elements[3:] {
current = filepath.Join(current, e)
if i > 0 {
if err := os.Mkdir(current, 0o755); err != nil {
if !os.IsExist(err) {
return err
}
} else {
// If the directory was created, be sure it is not left around on errors.
defer func() {
if deferredError != nil {
os.Remove(current)
}
}()
}
}
// We enable the controllers for all the path components except the last one. It is not allowed to add
// PIDs if there are already enabled controllers.
if i < len(elements[3:])-1 {
if err := ioutil.WriteFile(filepath.Join(current, "cgroup.subtree_control"), res, 0o755); err != nil {
return err
}
}
}
return nil
}
func (c *CgroupControl) createCgroupDirectory(controller string) (bool, error) {
cPath := c.getCgroupv1Path(controller)
_, err := os.Stat(cPath)
if err == nil {
return false, nil
}
if !os.IsNotExist(err) {
return false, err
}
if err := os.MkdirAll(cPath, 0o755); err != nil {
return false, errors.Wrapf(err, "error creating cgroup for %s", controller)
}
return true, nil
}

View file

@ -0,0 +1,146 @@
//go:build linux
// +build linux
package cgroups
import (
"bytes"
"fmt"
"os"
"path"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// WriteFile writes to a cgroup file
func WriteFile(dir, file, data string) error {
fd, err := OpenFile(dir, file, unix.O_WRONLY)
if err != nil {
return err
}
defer fd.Close()
for {
_, err := fd.Write([]byte(data))
if errors.Is(err, unix.EINTR) {
logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
continue
}
return err
}
}
// OpenFile opens a cgroup file with the given flags
func OpenFile(dir, file string, flags int) (*os.File, error) {
var resolveFlags uint64
mode := os.FileMode(0)
if TestMode && flags&os.O_WRONLY != 0 {
flags |= os.O_TRUNC | os.O_CREATE
mode = 0o600
}
cgroupPath := path.Join(dir, file)
relPath := strings.TrimPrefix(cgroupPath, cgroupRoot+"/")
var stats unix.Statfs_t
fdTest, errOpen := unix.Openat2(-1, cgroupRoot, &unix.OpenHow{
Flags: unix.O_DIRECTORY | unix.O_PATH,
})
errStat := unix.Fstatfs(fdTest, &stats)
cgroupFd := fdTest
resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
if stats.Type == unix.CGROUP2_SUPER_MAGIC {
// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
}
if errOpen != nil || errStat != nil || (len(relPath) == len(cgroupPath)) { // openat2 not available, use os
fdTest, err := os.OpenFile(cgroupPath, flags, mode)
if err != nil {
return nil, err
}
if TestMode {
return fdTest, nil
}
if err := unix.Fstatfs(int(fdTest.Fd()), &stats); err != nil {
_ = fdTest.Close()
return nil, &os.PathError{Op: "statfs", Path: cgroupPath, Err: err}
}
if stats.Type != unix.CGROUP_SUPER_MAGIC && stats.Type != unix.CGROUP2_SUPER_MAGIC {
_ = fdTest.Close()
return nil, &os.PathError{Op: "open", Path: cgroupPath, Err: errors.New("not a cgroup file")}
}
return fdTest, nil
}
fd, err := unix.Openat2(cgroupFd, relPath,
&unix.OpenHow{
Resolve: resolveFlags,
Flags: uint64(flags) | unix.O_CLOEXEC,
Mode: uint64(mode),
})
if err != nil {
fmt.Println("Error in openat")
return nil, err
}
return os.NewFile(uintptr(fd), cgroupPath), nil
}
// ReadFile reads from a cgroup file, opening it with the read only flag
func ReadFile(dir, file string) (string, error) {
fd, err := OpenFile(dir, file, unix.O_RDONLY)
if err != nil {
return "", err
}
defer fd.Close()
var buf bytes.Buffer
_, err = buf.ReadFrom(fd)
return buf.String(), err
}
// GetBlkioFiles gets the proper files for blkio weights
func GetBlkioFiles(cgroupPath string) (wtFile, wtDevFile string) {
var weightFile string
var weightDeviceFile string
// in this important since runc keeps these variables private, they won't be set
if cgroups.PathExists(filepath.Join(cgroupPath, "blkio.weight")) {
weightFile = "blkio.weight"
weightDeviceFile = "blkio.weight_device"
} else {
weightFile = "blkio.bfq.weight"
weightDeviceFile = "blkio.bfq.weight_device"
}
return weightFile, weightDeviceFile
}
// SetBlkioThrottle sets the throttle limits for the cgroup
func SetBlkioThrottle(res *configs.Resources, cgroupPath string) error {
for _, td := range res.BlkioThrottleReadBpsDevice {
if err := WriteFile(cgroupPath, "blkio.throttle.read_bps_device", fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate)); err != nil {
return err
}
}
for _, td := range res.BlkioThrottleWriteBpsDevice {
if err := WriteFile(cgroupPath, "blkio.throttle.write_bps_device", fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate)); err != nil {
return err
}
}
for _, td := range res.BlkioThrottleReadIOPSDevice {
if err := WriteFile(cgroupPath, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err
}
}
for _, td := range res.BlkioThrottleWriteIOPSDevice {
if err := WriteFile(cgroupPath, "blkio.throttle.write_iops_device", td.String()); err != nil {
return err
}
}
return nil
}

View file

@ -168,7 +168,8 @@ func matchSyscall(filter *libseccomp.ScmpFilter, call *Syscall) error {
func toAction(act Action, errnoRet *uint) (libseccomp.ScmpAction, error) {
switch act {
case ActKill:
return libseccomp.ActKill, nil
// lint was not passing until this was changed from ActKill to ActKilThread.
return libseccomp.ActKillThread, nil
case ActKillProcess:
return libseccomp.ActKillProcess, nil
case ActErrno:

View file

@ -1,9 +1,24 @@
package cgroups
import (
"errors"
"github.com/opencontainers/runc/libcontainer/configs"
)
var (
// ErrDevicesUnsupported is an error returned when a cgroup manager
// is not configured to set device rules.
ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules")
// DevicesSetV1 and DevicesSetV2 are functions to set devices for
// cgroup v1 and v2, respectively. Unless libcontainer/cgroups/devices
// package is imported, it is set to nil, so cgroup managers can't
// manage devices.
DevicesSetV1 func(path string, r *configs.Resources) error
DevicesSetV2 func(path string, r *configs.Resources) error
)
type Manager interface {
// Apply creates a cgroup, if not yet created, and adds a process
// with the specified pid into that cgroup. A special value of -1

View file

@ -0,0 +1,311 @@
package fs
import (
"bufio"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type BlkioGroup struct {
weightFilename string
weightDeviceFilename string
}
func (s *BlkioGroup) Name() string {
return "blkio"
}
func (s *BlkioGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *BlkioGroup) Set(path string, r *configs.Resources) error {
s.detectWeightFilenames(path)
if r.BlkioWeight != 0 {
if err := cgroups.WriteFile(path, s.weightFilename, strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
return err
}
}
if r.BlkioLeafWeight != 0 {
if err := cgroups.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil {
return err
}
}
for _, wd := range r.BlkioWeightDevice {
if wd.Weight != 0 {
if err := cgroups.WriteFile(path, s.weightDeviceFilename, wd.WeightString()); err != nil {
return err
}
}
if wd.LeafWeight != 0 {
if err := cgroups.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
return err
}
}
}
for _, td := range r.BlkioThrottleReadBpsDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteBpsDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleReadIOPSDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteIOPSDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
return err
}
}
return nil
}
/*
examples:
blkio.sectors
8:0 6792
blkio.io_service_bytes
8:0 Read 1282048
8:0 Write 2195456
8:0 Sync 2195456
8:0 Async 1282048
8:0 Total 3477504
Total 3477504
blkio.io_serviced
8:0 Read 124
8:0 Write 104
8:0 Sync 104
8:0 Async 124
8:0 Total 228
Total 228
blkio.io_queued
8:0 Read 0
8:0 Write 0
8:0 Sync 0
8:0 Async 0
8:0 Total 0
Total 0
*/
func splitBlkioStatLine(r rune) bool {
return r == ' ' || r == ':'
}
func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) {
var blkioStats []cgroups.BlkioStatEntry
f, err := cgroups.OpenFile(dir, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return blkioStats, nil
}
return nil, err
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
// format: dev type amount
fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine)
if len(fields) < 3 {
if len(fields) == 2 && fields[0] == "Total" {
// skip total line
continue
} else {
return nil, malformedLine(dir, file, sc.Text())
}
}
v, err := strconv.ParseUint(fields[0], 10, 64)
if err != nil {
return nil, &parseError{Path: dir, File: file, Err: err}
}
major := v
v, err = strconv.ParseUint(fields[1], 10, 64)
if err != nil {
return nil, &parseError{Path: dir, File: file, Err: err}
}
minor := v
op := ""
valueField := 2
if len(fields) == 4 {
op = fields[2]
valueField = 3
}
v, err = strconv.ParseUint(fields[valueField], 10, 64)
if err != nil {
return nil, &parseError{Path: dir, File: file, Err: err}
}
blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v})
}
if err := sc.Err(); err != nil {
return nil, &parseError{Path: dir, File: file, Err: err}
}
return blkioStats, nil
}
func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
type blkioStatInfo struct {
filename string
blkioStatEntriesPtr *[]cgroups.BlkioStatEntry
}
bfqDebugStats := []blkioStatInfo{
{
filename: "blkio.bfq.sectors_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
},
{
filename: "blkio.bfq.io_service_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive,
},
{
filename: "blkio.bfq.io_wait_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive,
},
{
filename: "blkio.bfq.io_merged_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive,
},
{
filename: "blkio.bfq.io_queued_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive,
},
{
filename: "blkio.bfq.time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive,
},
{
filename: "blkio.bfq.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.bfq.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
bfqStats := []blkioStatInfo{
{
filename: "blkio.bfq.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.bfq.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
cfqStats := []blkioStatInfo{
{
filename: "blkio.sectors_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
},
{
filename: "blkio.io_service_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive,
},
{
filename: "blkio.io_wait_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive,
},
{
filename: "blkio.io_merged_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive,
},
{
filename: "blkio.io_queued_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive,
},
{
filename: "blkio.time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive,
},
{
filename: "blkio.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
throttleRecursiveStats := []blkioStatInfo{
{
filename: "blkio.throttle.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.throttle.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
baseStats := []blkioStatInfo{
{
filename: "blkio.throttle.io_serviced",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.throttle.io_service_bytes",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
orderedStats := [][]blkioStatInfo{
bfqDebugStats,
bfqStats,
cfqStats,
throttleRecursiveStats,
baseStats,
}
var blkioStats []cgroups.BlkioStatEntry
var err error
for _, statGroup := range orderedStats {
for i, statInfo := range statGroup {
if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil {
// if error occurs on first file, move to next group
if i == 0 {
break
}
return err
}
*statInfo.blkioStatEntriesPtr = blkioStats
// finish if all stats are gathered
if i == len(statGroup)-1 {
return nil
}
}
}
return nil
}
func (s *BlkioGroup) detectWeightFilenames(path string) {
if s.weightFilename != "" {
// Already detected.
return
}
if cgroups.PathExists(filepath.Join(path, "blkio.weight")) {
s.weightFilename = "blkio.weight"
s.weightDeviceFilename = "blkio.weight_device"
} else {
s.weightFilename = "blkio.bfq.weight"
s.weightDeviceFilename = "blkio.bfq.weight_device"
}
}

View file

@ -0,0 +1,129 @@
package fs
import (
"bufio"
"errors"
"fmt"
"os"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"golang.org/x/sys/unix"
)
type CpuGroup struct{}
func (s *CpuGroup) Name() string {
return "cpu"
}
func (s *CpuGroup) Apply(path string, r *configs.Resources, pid int) error {
if err := os.MkdirAll(path, 0o755); err != nil {
return err
}
// We should set the real-Time group scheduling settings before moving
// in the process because if the process is already in SCHED_RR mode
// and no RT bandwidth is set, adding it will fail.
if err := s.SetRtSched(path, r); err != nil {
return err
}
// Since we are not using apply(), we need to place the pid
// into the procs file.
return cgroups.WriteCgroupProc(path, pid)
}
func (s *CpuGroup) SetRtSched(path string, r *configs.Resources) error {
if r.CpuRtPeriod != 0 {
if err := cgroups.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(r.CpuRtPeriod, 10)); err != nil {
return err
}
}
if r.CpuRtRuntime != 0 {
if err := cgroups.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil {
return err
}
}
return nil
}
func (s *CpuGroup) Set(path string, r *configs.Resources) error {
if r.CpuShares != 0 {
shares := r.CpuShares
if err := cgroups.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil {
return err
}
// read it back
sharesRead, err := fscommon.GetCgroupParamUint(path, "cpu.shares")
if err != nil {
return err
}
// ... and check
if shares > sharesRead {
return fmt.Errorf("the maximum allowed cpu-shares is %d", sharesRead)
} else if shares < sharesRead {
return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead)
}
}
var period string
if r.CpuPeriod != 0 {
period = strconv.FormatUint(r.CpuPeriod, 10)
if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil {
// Sometimes when the period to be set is smaller
// than the current one, it is rejected by the kernel
// (EINVAL) as old_quota/new_period exceeds the parent
// cgroup quota limit. If this happens and the quota is
// going to be set, ignore the error for now and retry
// after setting the quota.
if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 {
return err
}
} else {
period = ""
}
}
if r.CpuQuota != 0 {
if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil {
return err
}
if period != "" {
if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil {
return err
}
}
}
return s.SetRtSched(path, r)
}
func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
const file = "cpu.stat"
f, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return &parseError{Path: path, File: file, Err: err}
}
switch t {
case "nr_periods":
stats.CpuStats.ThrottlingData.Periods = v
case "nr_throttled":
stats.CpuStats.ThrottlingData.ThrottledPeriods = v
case "throttled_time":
stats.CpuStats.ThrottlingData.ThrottledTime = v
}
}
return nil
}

View file

@ -0,0 +1,166 @@
package fs
import (
"bufio"
"os"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
const (
cgroupCpuacctStat = "cpuacct.stat"
cgroupCpuacctUsageAll = "cpuacct.usage_all"
nanosecondsInSecond = 1000000000
userModeColumn = 1
kernelModeColumn = 2
cuacctUsageAllColumnsNumber = 3
// The value comes from `C.sysconf(C._SC_CLK_TCK)`, and
// on Linux it's a constant which is safe to be hard coded,
// so we can avoid using cgo here. For details, see:
// https://github.com/containerd/cgroups/pull/12
clockTicks uint64 = 100
)
type CpuacctGroup struct{}
func (s *CpuacctGroup) Name() string {
return "cpuacct"
}
func (s *CpuacctGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *CpuacctGroup) Set(_ string, _ *configs.Resources) error {
return nil
}
func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {
if !cgroups.PathExists(path) {
return nil
}
userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path)
if err != nil {
return err
}
totalUsage, err := fscommon.GetCgroupParamUint(path, "cpuacct.usage")
if err != nil {
return err
}
percpuUsage, err := getPercpuUsage(path)
if err != nil {
return err
}
percpuUsageInKernelmode, percpuUsageInUsermode, err := getPercpuUsageInModes(path)
if err != nil {
return err
}
stats.CpuStats.CpuUsage.TotalUsage = totalUsage
stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage
stats.CpuStats.CpuUsage.PercpuUsageInKernelmode = percpuUsageInKernelmode
stats.CpuStats.CpuUsage.PercpuUsageInUsermode = percpuUsageInUsermode
stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage
stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage
return nil
}
// Returns user and kernel usage breakdown in nanoseconds.
func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
var userModeUsage, kernelModeUsage uint64
const (
userField = "user"
systemField = "system"
file = cgroupCpuacctStat
)
// Expected format:
// user <usage in ticks>
// system <usage in ticks>
data, err := cgroups.ReadFile(path, file)
if err != nil {
return 0, 0, err
}
// TODO: use strings.SplitN instead.
fields := strings.Fields(data)
if len(fields) < 4 || fields[0] != userField || fields[2] != systemField {
return 0, 0, malformedLine(path, file, data)
}
if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil {
return 0, 0, &parseError{Path: path, File: file, Err: err}
}
if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil {
return 0, 0, &parseError{Path: path, File: file, Err: err}
}
return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil
}
func getPercpuUsage(path string) ([]uint64, error) {
const file = "cpuacct.usage_percpu"
percpuUsage := []uint64{}
data, err := cgroups.ReadFile(path, file)
if err != nil {
return percpuUsage, err
}
// TODO: use strings.SplitN instead.
for _, value := range strings.Fields(data) {
value, err := strconv.ParseUint(value, 10, 64)
if err != nil {
return percpuUsage, &parseError{Path: path, File: file, Err: err}
}
percpuUsage = append(percpuUsage, value)
}
return percpuUsage, nil
}
func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) {
usageKernelMode := []uint64{}
usageUserMode := []uint64{}
const file = cgroupCpuacctUsageAll
fd, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if os.IsNotExist(err) {
return usageKernelMode, usageUserMode, nil
} else if err != nil {
return nil, nil, err
}
defer fd.Close()
scanner := bufio.NewScanner(fd)
scanner.Scan() // skipping header line
for scanner.Scan() {
lineFields := strings.SplitN(scanner.Text(), " ", cuacctUsageAllColumnsNumber+1)
if len(lineFields) != cuacctUsageAllColumnsNumber {
continue
}
usageInKernelMode, err := strconv.ParseUint(lineFields[kernelModeColumn], 10, 64)
if err != nil {
return nil, nil, &parseError{Path: path, File: file, Err: err}
}
usageKernelMode = append(usageKernelMode, usageInKernelMode)
usageInUserMode, err := strconv.ParseUint(lineFields[userModeColumn], 10, 64)
if err != nil {
return nil, nil, &parseError{Path: path, File: file, Err: err}
}
usageUserMode = append(usageUserMode, usageInUserMode)
}
if err := scanner.Err(); err != nil {
return nil, nil, &parseError{Path: path, File: file, Err: err}
}
return usageKernelMode, usageUserMode, nil
}

View file

@ -0,0 +1,245 @@
package fs
import (
"errors"
"os"
"path/filepath"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type CpusetGroup struct{}
func (s *CpusetGroup) Name() string {
return "cpuset"
}
func (s *CpusetGroup) Apply(path string, r *configs.Resources, pid int) error {
return s.ApplyDir(path, r, pid)
}
func (s *CpusetGroup) Set(path string, r *configs.Resources) error {
if r.CpusetCpus != "" {
if err := cgroups.WriteFile(path, "cpuset.cpus", r.CpusetCpus); err != nil {
return err
}
}
if r.CpusetMems != "" {
if err := cgroups.WriteFile(path, "cpuset.mems", r.CpusetMems); err != nil {
return err
}
}
return nil
}
func getCpusetStat(path string, file string) ([]uint16, error) {
var extracted []uint16
fileContent, err := fscommon.GetCgroupParamString(path, file)
if err != nil {
return extracted, err
}
if len(fileContent) == 0 {
return extracted, &parseError{Path: path, File: file, Err: errors.New("empty file")}
}
for _, s := range strings.Split(fileContent, ",") {
sp := strings.SplitN(s, "-", 3)
switch len(sp) {
case 3:
return extracted, &parseError{Path: path, File: file, Err: errors.New("extra dash")}
case 2:
min, err := strconv.ParseUint(sp[0], 10, 16)
if err != nil {
return extracted, &parseError{Path: path, File: file, Err: err}
}
max, err := strconv.ParseUint(sp[1], 10, 16)
if err != nil {
return extracted, &parseError{Path: path, File: file, Err: err}
}
if min > max {
return extracted, &parseError{Path: path, File: file, Err: errors.New("invalid values, min > max")}
}
for i := min; i <= max; i++ {
extracted = append(extracted, uint16(i))
}
case 1:
value, err := strconv.ParseUint(s, 10, 16)
if err != nil {
return extracted, &parseError{Path: path, File: file, Err: err}
}
extracted = append(extracted, uint16(value))
}
}
return extracted, nil
}
func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
var err error
stats.CPUSetStats.CPUs, err = getCpusetStat(path, "cpuset.cpus")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.cpu_exclusive")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.Mems, err = getCpusetStat(path, "cpuset.mems")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_hardwall")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_exclusive")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemoryMigrate, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_migrate")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_page")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_slab")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_pressure")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, "cpuset.sched_load_balance")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, "cpuset.sched_relax_domain_level")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
return nil
}
func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error {
// This might happen if we have no cpuset cgroup mounted.
// Just do nothing and don't fail.
if dir == "" {
return nil
}
// 'ensureParent' start with parent because we don't want to
// explicitly inherit from parent, it could conflict with
// 'cpuset.cpu_exclusive'.
if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil {
return err
}
if err := os.Mkdir(dir, 0o755); err != nil && !os.IsExist(err) {
return err
}
// We didn't inherit cpuset configs from parent, but we have
// to ensure cpuset configs are set before moving task into the
// cgroup.
// The logic is, if user specified cpuset configs, use these
// specified configs, otherwise, inherit from parent. This makes
// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
// keep backward compatibility.
if err := s.ensureCpusAndMems(dir, r); err != nil {
return err
}
// Since we are not using apply(), we need to place the pid
// into the procs file.
return cgroups.WriteCgroupProc(dir, pid)
}
func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) {
if cpus, err = cgroups.ReadFile(parent, "cpuset.cpus"); err != nil {
return
}
if mems, err = cgroups.ReadFile(parent, "cpuset.mems"); err != nil {
return
}
return cpus, mems, nil
}
// cpusetEnsureParent makes sure that the parent directories of current
// are created and populated with the proper cpus and mems files copied
// from their respective parent. It does that recursively, starting from
// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point).
func cpusetEnsureParent(current string) error {
var st unix.Statfs_t
parent := filepath.Dir(current)
err := unix.Statfs(parent, &st)
if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC {
return nil
}
// Treat non-existing directory as cgroupfs as it will be created,
// and the root cpuset directory obviously exists.
if err != nil && err != unix.ENOENT { //nolint:errorlint // unix errors are bare
return &os.PathError{Op: "statfs", Path: parent, Err: err}
}
if err := cpusetEnsureParent(parent); err != nil {
return err
}
if err := os.Mkdir(current, 0o755); err != nil && !os.IsExist(err) {
return err
}
return cpusetCopyIfNeeded(current, parent)
}
// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent
// directory to the current directory if the file's contents are 0
func cpusetCopyIfNeeded(current, parent string) error {
currentCpus, currentMems, err := getCpusetSubsystemSettings(current)
if err != nil {
return err
}
parentCpus, parentMems, err := getCpusetSubsystemSettings(parent)
if err != nil {
return err
}
if isEmptyCpuset(currentCpus) {
if err := cgroups.WriteFile(current, "cpuset.cpus", parentCpus); err != nil {
return err
}
}
if isEmptyCpuset(currentMems) {
if err := cgroups.WriteFile(current, "cpuset.mems", parentMems); err != nil {
return err
}
}
return nil
}
func isEmptyCpuset(str string) bool {
return str == "" || str == "\n"
}
func (s *CpusetGroup) ensureCpusAndMems(path string, r *configs.Resources) error {
if err := s.Set(path, r); err != nil {
return err
}
return cpusetCopyIfNeeded(path, filepath.Dir(path))
}

View file

@ -0,0 +1,39 @@
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type DevicesGroup struct{}
func (s *DevicesGroup) Name() string {
return "devices"
}
func (s *DevicesGroup) Apply(path string, r *configs.Resources, pid int) error {
if r.SkipDevices {
return nil
}
if path == "" {
// Return error here, since devices cgroup
// is a hard requirement for container's security.
return errSubsystemDoesNotExist
}
return apply(path, pid)
}
func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
if cgroups.DevicesSetV1 == nil {
if len(r.Devices) == 0 {
return nil
}
return cgroups.ErrDevicesUnsupported
}
return cgroups.DevicesSetV1(path, r)
}
func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View file

@ -0,0 +1,15 @@
package fs
import (
"fmt"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
type parseError = fscommon.ParseError
// malformedLine is used by all cgroupfs file parsers that expect a line
// in a particular format but get some garbage instead.
func malformedLine(path, file, line string) error {
return &parseError{Path: path, File: file, Err: fmt.Errorf("malformed line: %s", line)}
}

View file

@ -0,0 +1,158 @@
package fs
import (
"errors"
"fmt"
"os"
"strings"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
type FreezerGroup struct{}
func (s *FreezerGroup) Name() string {
return "freezer"
}
func (s *FreezerGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {
switch r.Freezer {
case configs.Frozen:
defer func() {
if Err != nil {
// Freezing failed, and it is bad and dangerous
// to leave the cgroup in FROZEN or FREEZING
// state, so (try to) thaw it back.
_ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
}
}()
// As per older kernel docs (freezer-subsystem.txt before
// kernel commit ef9fe980c6fcc1821), if FREEZING is seen,
// userspace should either retry or thaw. While current
// kernel cgroup v1 docs no longer mention a need to retry,
// even a recent kernel (v5.4, Ubuntu 20.04) can't reliably
// freeze a cgroup v1 while new processes keep appearing in it
// (either via fork/clone or by writing new PIDs to
// cgroup.procs).
//
// The numbers below are empirically chosen to have a decent
// chance to succeed in various scenarios ("runc pause/unpause
// with parallel runc exec" and "bare freeze/unfreeze on a very
// slow system"), tested on RHEL7 and Ubuntu 20.04 kernels.
//
// Adding any amount of sleep in between retries did not
// increase the chances of successful freeze in "pause/unpause
// with parallel exec" reproducer. OTOH, adding an occasional
// sleep helped for the case where the system is extremely slow
// (CentOS 7 VM on GHA CI).
//
// Alas, this is still a game of chances, since the real fix
// belong to the kernel (cgroup v2 do not have this bug).
for i := 0; i < 1000; i++ {
if i%50 == 49 {
// Occasional thaw and sleep improves
// the chances to succeed in freezing
// in case new processes keep appearing
// in the cgroup.
_ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
time.Sleep(10 * time.Millisecond)
}
if err := cgroups.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil {
return err
}
if i%25 == 24 {
// Occasional short sleep before reading
// the state back also improves the chances to
// succeed in freezing in case of a very slow
// system.
time.Sleep(10 * time.Microsecond)
}
state, err := cgroups.ReadFile(path, "freezer.state")
if err != nil {
return err
}
state = strings.TrimSpace(state)
switch state {
case "FREEZING":
continue
case string(configs.Frozen):
if i > 1 {
logrus.Debugf("frozen after %d retries", i)
}
return nil
default:
// should never happen
return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state))
}
}
// Despite our best efforts, it got stuck in FREEZING.
return errors.New("unable to freeze")
case configs.Thawed:
return cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
case configs.Undefined:
return nil
default:
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer))
}
}
func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) {
for {
state, err := cgroups.ReadFile(path, "freezer.state")
if err != nil {
// If the kernel is too old, then we just treat the freezer as
// being in an "undefined" state.
if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) {
err = nil
}
return configs.Undefined, err
}
switch strings.TrimSpace(state) {
case "THAWED":
return configs.Thawed, nil
case "FROZEN":
// Find out whether the cgroup is frozen directly,
// or indirectly via an ancestor.
self, err := cgroups.ReadFile(path, "freezer.self_freezing")
if err != nil {
// If the kernel is too old, then we just treat
// it as being frozen.
if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.ENODEV) {
err = nil
}
return configs.Frozen, err
}
switch self {
case "0\n":
return configs.Thawed, nil
case "1\n":
return configs.Frozen, nil
default:
return configs.Undefined, fmt.Errorf(`unknown "freezer.self_freezing" state: %q`, self)
}
case "FREEZING":
// Make sure we get a stable freezer state, so retry if the cgroup
// is still undergoing freezing. This should be a temporary delay.
time.Sleep(1 * time.Millisecond)
continue
default:
return configs.Undefined, fmt.Errorf("unknown freezer.state %q", state)
}
}
}

View file

@ -0,0 +1,264 @@
package fs
import (
"errors"
"fmt"
"os"
"sync"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
var subsystems = []subsystem{
&CpusetGroup{},
&DevicesGroup{},
&MemoryGroup{},
&CpuGroup{},
&CpuacctGroup{},
&PidsGroup{},
&BlkioGroup{},
&HugetlbGroup{},
&NetClsGroup{},
&NetPrioGroup{},
&PerfEventGroup{},
&FreezerGroup{},
&RdmaGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
}
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
func init() {
// If using cgroups-hybrid mode then add a "" controller indicating
// it should join the cgroups v2.
if cgroups.IsCgroup2HybridMode() {
subsystems = append(subsystems, &NameGroup{GroupName: "", Join: true})
}
}
type subsystem interface {
// Name returns the name of the subsystem.
Name() string
// GetStats fills in the stats for the subsystem.
GetStats(path string, stats *cgroups.Stats) error
// Apply creates and joins a cgroup, adding pid into it. Some
// subsystems use resources to pre-configure the cgroup parents
// before creating or joining it.
Apply(path string, r *configs.Resources, pid int) error
// Set sets the cgroup resources.
Set(path string, r *configs.Resources) error
}
type manager struct {
mu sync.Mutex
cgroups *configs.Cgroup
paths map[string]string
}
func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
// Some v1 controllers (cpu, cpuset, and devices) expect
// cgroups.Resources to not be nil in Apply.
if cg.Resources == nil {
return nil, errors.New("cgroup v1 manager needs configs.Resources to be set during manager creation")
}
if cg.Resources.Unified != nil {
return nil, cgroups.ErrV1NoUnified
}
if paths == nil {
var err error
paths, err = initPaths(cg)
if err != nil {
return nil, err
}
}
return &manager{
cgroups: cg,
paths: paths,
}, nil
}
// isIgnorableError returns whether err is a permission error (in the loose
// sense of the word). This includes EROFS (which for an unprivileged user is
// basically a permission error) and EACCES (for similar reasons) as well as
// the normal EPERM.
func isIgnorableError(rootless bool, err error) bool {
// We do not ignore errors if we are root.
if !rootless {
return false
}
// Is it an ordinary EPERM?
if errors.Is(err, os.ErrPermission) {
return true
}
// Handle some specific syscall errors.
var errno unix.Errno
if errors.As(err, &errno) {
return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES
}
return false
}
func (m *manager) Apply(pid int) (err error) {
m.mu.Lock()
defer m.mu.Unlock()
c := m.cgroups
for _, sys := range subsystems {
name := sys.Name()
p, ok := m.paths[name]
if !ok {
continue
}
if err := sys.Apply(p, c.Resources, pid); err != nil {
// In the case of rootless (including euid=0 in userns), where an
// explicit cgroup path hasn't been set, we don't bail on error in
// case of permission problems here, but do delete the path from
// the m.paths map, since it is either non-existent and could not
// be created, or the pid could not be added to it.
//
// Cases where limits for the subsystem have been set are handled
// later by Set, which fails with a friendly error (see
// if path == "" in Set).
if isIgnorableError(c.Rootless, err) && c.Path == "" {
delete(m.paths, name)
continue
}
return err
}
}
return nil
}
func (m *manager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
return cgroups.RemovePaths(m.paths)
}
func (m *manager) Path(subsys string) string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths[subsys]
}
func (m *manager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
stats := cgroups.NewStats()
for _, sys := range subsystems {
path := m.paths[sys.Name()]
if path == "" {
continue
}
if err := sys.GetStats(path, stats); err != nil {
return nil, err
}
}
return stats, nil
}
func (m *manager) Set(r *configs.Resources) error {
if r == nil {
return nil
}
if r.Unified != nil {
return cgroups.ErrV1NoUnified
}
m.mu.Lock()
defer m.mu.Unlock()
for _, sys := range subsystems {
path := m.paths[sys.Name()]
if err := sys.Set(path, r); err != nil {
// When rootless is true, errors from the device subsystem
// are ignored, as it is really not expected to work.
if m.cgroups.Rootless && sys.Name() == "devices" && !errors.Is(err, cgroups.ErrDevicesUnsupported) {
continue
}
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if path == "" {
// We never created a path for this cgroup, so we cannot set
// limits for it (though we have already tried at this point).
return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name())
}
return err
}
}
return nil
}
// Freeze toggles the container's freezer cgroup depending on the state
// provided
func (m *manager) Freeze(state configs.FreezerState) error {
path := m.Path("freezer")
if path == "" {
return errors.New("cannot toggle freezer: cgroups not configured for container")
}
prevState := m.cgroups.Resources.Freezer
m.cgroups.Resources.Freezer = state
freezer := &FreezerGroup{}
if err := freezer.Set(path, m.cgroups.Resources); err != nil {
m.cgroups.Resources.Freezer = prevState
return err
}
return nil
}
func (m *manager) GetPids() ([]int, error) {
return cgroups.GetPids(m.Path("devices"))
}
func (m *manager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.Path("devices"))
}
func (m *manager) GetPaths() map[string]string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths
}
func (m *manager) GetCgroups() (*configs.Cgroup, error) {
return m.cgroups, nil
}
func (m *manager) GetFreezerState() (configs.FreezerState, error) {
dir := m.Path("freezer")
// If the container doesn't have the freezer cgroup, say it's undefined.
if dir == "" {
return configs.Undefined, nil
}
freezer := &FreezerGroup{}
return freezer.GetState(dir)
}
func (m *manager) Exists() bool {
return cgroups.PathExists(m.Path("devices"))
}
func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill")
}
func (m *manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.Path("memory"))
// Ignore ENOENT when rootless as it couldn't create cgroup.
if err != nil && m.cgroups.Rootless && os.IsNotExist(err) {
err = nil
}
return c, err
}

View file

@ -0,0 +1,62 @@
package fs
import (
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type HugetlbGroup struct{}
func (s *HugetlbGroup) Name() string {
return "hugetlb"
}
func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *HugetlbGroup) Set(path string, r *configs.Resources) error {
for _, hugetlb := range r.HugetlbLimit {
if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}
}
return nil
}
func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
if !cgroups.PathExists(path) {
return nil
}
hugetlbStats := cgroups.HugetlbStats{}
for _, pageSize := range cgroups.HugePageSizes() {
usage := "hugetlb." + pageSize + ".usage_in_bytes"
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
return err
}
hugetlbStats.Usage = value
maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes"
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil {
return err
}
hugetlbStats.MaxUsage = value
failcnt := "hugetlb." + pageSize + ".failcnt"
value, err = fscommon.GetCgroupParamUint(path, failcnt)
if err != nil {
return err
}
hugetlbStats.Failcnt = value
stats.HugetlbStats[pageSize] = hugetlbStats
}
return nil
}

View file

@ -0,0 +1,348 @@
package fs
import (
"bufio"
"errors"
"fmt"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
const (
cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
cgroupMemoryLimit = "memory.limit_in_bytes"
cgroupMemoryUsage = "memory.usage_in_bytes"
cgroupMemoryMaxUsage = "memory.max_usage_in_bytes"
)
type MemoryGroup struct{}
func (s *MemoryGroup) Name() string {
return "memory"
}
func (s *MemoryGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func setMemory(path string, val int64) error {
if val == 0 {
return nil
}
err := cgroups.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10))
if !errors.Is(err, unix.EBUSY) {
return err
}
// EBUSY means the kernel can't set new limit as it's too low
// (lower than the current usage). Return more specific error.
usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage)
if err != nil {
return err
}
max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage)
if err != nil {
return err
}
return fmt.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max)
}
func setSwap(path string, val int64) error {
if val == 0 {
return nil
}
return cgroups.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10))
}
func setMemoryAndSwap(path string, r *configs.Resources) error {
// If the memory update is set to -1 and the swap is not explicitly
// set, we should also set swap to -1, it means unlimited memory.
if r.Memory == -1 && r.MemorySwap == 0 {
// Only set swap if it's enabled in kernel
if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) {
r.MemorySwap = -1
}
}
// When memory and swap memory are both set, we need to handle the cases
// for updating container.
if r.Memory != 0 && r.MemorySwap != 0 {
curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit)
if err != nil {
return err
}
// When update memory limit, we should adapt the write sequence
// for memory and swap memory, so it won't fail because the new
// value and the old value don't fit kernel's validation.
if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) {
if err := setSwap(path, r.MemorySwap); err != nil {
return err
}
if err := setMemory(path, r.Memory); err != nil {
return err
}
return nil
}
}
if err := setMemory(path, r.Memory); err != nil {
return err
}
if err := setSwap(path, r.MemorySwap); err != nil {
return err
}
return nil
}
func (s *MemoryGroup) Set(path string, r *configs.Resources) error {
if err := setMemoryAndSwap(path, r); err != nil {
return err
}
// ignore KernelMemory and KernelMemoryTCP
if r.MemoryReservation != 0 {
if err := cgroups.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil {
return err
}
}
if r.OomKillDisable {
if err := cgroups.WriteFile(path, "memory.oom_control", "1"); err != nil {
return err
}
}
if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 {
return nil
} else if *r.MemorySwappiness <= 100 {
if err := cgroups.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil {
return err
}
} else {
return fmt.Errorf("invalid memory swappiness value: %d (valid range is 0-100)", *r.MemorySwappiness)
}
return nil
}
func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
const file = "memory.stat"
statsFile, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
defer statsFile.Close()
sc := bufio.NewScanner(statsFile)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return &parseError{Path: path, File: file, Err: err}
}
stats.MemoryStats.Stats[t] = v
}
stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"]
memoryUsage, err := getMemoryData(path, "")
if err != nil {
return err
}
stats.MemoryStats.Usage = memoryUsage
swapUsage, err := getMemoryData(path, "memsw")
if err != nil {
return err
}
stats.MemoryStats.SwapUsage = swapUsage
kernelUsage, err := getMemoryData(path, "kmem")
if err != nil {
return err
}
stats.MemoryStats.KernelUsage = kernelUsage
kernelTCPUsage, err := getMemoryData(path, "kmem.tcp")
if err != nil {
return err
}
stats.MemoryStats.KernelTCPUsage = kernelTCPUsage
value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy")
if err != nil {
return err
}
if value == 1 {
stats.MemoryStats.UseHierarchy = true
}
pagesByNUMA, err := getPageUsageByNUMA(path)
if err != nil {
return err
}
stats.MemoryStats.PageUsageByNUMA = pagesByNUMA
return nil
}
func getMemoryData(path, name string) (cgroups.MemoryData, error) {
memoryData := cgroups.MemoryData{}
moduleName := "memory"
if name != "" {
moduleName = "memory." + name
}
var (
usage = moduleName + ".usage_in_bytes"
maxUsage = moduleName + ".max_usage_in_bytes"
failcnt = moduleName + ".failcnt"
limit = moduleName + ".limit_in_bytes"
)
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
if name != "" && os.IsNotExist(err) {
// Ignore ENOENT as swap and kmem controllers
// are optional in the kernel.
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, err
}
memoryData.Usage = value
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil {
return cgroups.MemoryData{}, err
}
memoryData.MaxUsage = value
value, err = fscommon.GetCgroupParamUint(path, failcnt)
if err != nil {
return cgroups.MemoryData{}, err
}
memoryData.Failcnt = value
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
return cgroups.MemoryData{}, err
}
memoryData.Limit = value
return memoryData, nil
}
func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) {
const (
maxColumns = math.MaxUint8 + 1
file = "memory.numa_stat"
)
stats := cgroups.PageUsageByNUMA{}
fd, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if os.IsNotExist(err) {
return stats, nil
} else if err != nil {
return stats, err
}
defer fd.Close()
// File format is documented in linux/Documentation/cgroup-v1/memory.txt
// and it looks like this:
//
// total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
// file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
// anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
// unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
// hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
scanner := bufio.NewScanner(fd)
for scanner.Scan() {
var field *cgroups.PageStats
line := scanner.Text()
columns := strings.SplitN(line, " ", maxColumns)
for i, column := range columns {
byNode := strings.SplitN(column, "=", 2)
// Some custom kernels have non-standard fields, like
// numa_locality 0 0 0 0 0 0 0 0 0 0
// numa_exectime 0
if len(byNode) < 2 {
if i == 0 {
// Ignore/skip those.
break
} else {
// The first column was already validated,
// so be strict to the rest.
return stats, malformedLine(path, file, line)
}
}
key, val := byNode[0], byNode[1]
if i == 0 { // First column: key is name, val is total.
field = getNUMAField(&stats, key)
if field == nil { // unknown field (new kernel?)
break
}
field.Total, err = strconv.ParseUint(val, 0, 64)
if err != nil {
return stats, &parseError{Path: path, File: file, Err: err}
}
field.Nodes = map[uint8]uint64{}
} else { // Subsequent columns: key is N<id>, val is usage.
if len(key) < 2 || key[0] != 'N' {
// This is definitely an error.
return stats, malformedLine(path, file, line)
}
n, err := strconv.ParseUint(key[1:], 10, 8)
if err != nil {
return stats, &parseError{Path: path, File: file, Err: err}
}
usage, err := strconv.ParseUint(val, 10, 64)
if err != nil {
return stats, &parseError{Path: path, File: file, Err: err}
}
field.Nodes[uint8(n)] = usage
}
}
}
if err := scanner.Err(); err != nil {
return cgroups.PageUsageByNUMA{}, &parseError{Path: path, File: file, Err: err}
}
return stats, nil
}
func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats {
switch name {
case "total":
return &stats.Total
case "file":
return &stats.File
case "anon":
return &stats.Anon
case "unevictable":
return &stats.Unevictable
case "hierarchical_total":
return &stats.Hierarchical.Total
case "hierarchical_file":
return &stats.Hierarchical.File
case "hierarchical_anon":
return &stats.Hierarchical.Anon
case "hierarchical_unevictable":
return &stats.Hierarchical.Unevictable
}
return nil
}

View file

@ -0,0 +1,31 @@
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type NameGroup struct {
GroupName string
Join bool
}
func (s *NameGroup) Name() string {
return s.GroupName
}
func (s *NameGroup) Apply(path string, _ *configs.Resources, pid int) error {
if s.Join {
// Ignore errors if the named cgroup does not exist.
_ = apply(path, pid)
}
return nil
}
func (s *NameGroup) Set(_ string, _ *configs.Resources) error {
return nil
}
func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View file

@ -0,0 +1,32 @@
package fs
import (
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type NetClsGroup struct{}
func (s *NetClsGroup) Name() string {
return "net_cls"
}
func (s *NetClsGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *NetClsGroup) Set(path string, r *configs.Resources) error {
if r.NetClsClassid != 0 {
if err := cgroups.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil {
return err
}
}
return nil
}
func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View file

@ -0,0 +1,30 @@
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type NetPrioGroup struct{}
func (s *NetPrioGroup) Name() string {
return "net_prio"
}
func (s *NetPrioGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *NetPrioGroup) Set(path string, r *configs.Resources) error {
for _, prioMap := range r.NetPrioIfpriomap {
if err := cgroups.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil {
return err
}
}
return nil
}
func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View file

@ -0,0 +1,186 @@
package fs
import (
"errors"
"os"
"path/filepath"
"sync"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
)
// The absolute path to the root of the cgroup hierarchies.
var (
cgroupRootLock sync.Mutex
cgroupRoot string
)
const defaultCgroupRoot = "/sys/fs/cgroup"
func initPaths(cg *configs.Cgroup) (map[string]string, error) {
root, err := rootPath()
if err != nil {
return nil, err
}
inner, err := innerPath(cg)
if err != nil {
return nil, err
}
paths := make(map[string]string)
for _, sys := range subsystems {
name := sys.Name()
path, err := subsysPath(root, inner, name)
if err != nil {
// The non-presence of the devices subsystem
// is considered fatal for security reasons.
if cgroups.IsNotFound(err) && (cg.SkipDevices || name != "devices") {
continue
}
return nil, err
}
paths[name] = path
}
return paths, nil
}
func tryDefaultCgroupRoot() string {
var st, pst unix.Stat_t
// (1) it should be a directory...
err := unix.Lstat(defaultCgroupRoot, &st)
if err != nil || st.Mode&unix.S_IFDIR == 0 {
return ""
}
// (2) ... and a mount point ...
err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst)
if err != nil {
return ""
}
if st.Dev == pst.Dev {
// parent dir has the same dev -- not a mount point
return ""
}
// (3) ... of 'tmpfs' fs type.
var fst unix.Statfs_t
err = unix.Statfs(defaultCgroupRoot, &fst)
if err != nil || fst.Type != unix.TMPFS_MAGIC {
return ""
}
// (4) it should have at least 1 entry ...
dir, err := os.Open(defaultCgroupRoot)
if err != nil {
return ""
}
names, err := dir.Readdirnames(1)
if err != nil {
return ""
}
if len(names) < 1 {
return ""
}
// ... which is a cgroup mount point.
err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst)
if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC {
return ""
}
return defaultCgroupRoot
}
// rootPath finds and returns path to the root of the cgroup hierarchies.
func rootPath() (string, error) {
cgroupRootLock.Lock()
defer cgroupRootLock.Unlock()
if cgroupRoot != "" {
return cgroupRoot, nil
}
// fast path
cgroupRoot = tryDefaultCgroupRoot()
if cgroupRoot != "" {
return cgroupRoot, nil
}
// slow path: parse mountinfo
mi, err := cgroups.GetCgroupMounts(false)
if err != nil {
return "", err
}
if len(mi) < 1 {
return "", errors.New("no cgroup mount found in mountinfo")
}
// Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"),
// use its parent directory.
root := filepath.Dir(mi[0].Mountpoint)
if _, err := os.Stat(root); err != nil {
return "", err
}
cgroupRoot = root
return cgroupRoot, nil
}
func innerPath(c *configs.Cgroup) (string, error) {
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return "", errors.New("cgroup: either Path or Name and Parent should be used")
}
// XXX: Do not remove CleanPath. Path safety is important! -- cyphar
innerPath := utils.CleanPath(c.Path)
if innerPath == "" {
cgParent := utils.CleanPath(c.Parent)
cgName := utils.CleanPath(c.Name)
innerPath = filepath.Join(cgParent, cgName)
}
return innerPath, nil
}
func subsysPath(root, inner, subsystem string) (string, error) {
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
if filepath.IsAbs(inner) {
mnt, err := cgroups.FindCgroupMountpoint(root, subsystem)
// If we didn't mount the subsystem, there is no point we make the path.
if err != nil {
return "", err
}
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
return filepath.Join(root, filepath.Base(mnt), inner), nil
}
// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
// process could in container and shared pid namespace with host, and
// /proc/1/cgroup could point to whole other world of cgroups.
parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
if err != nil {
return "", err
}
return filepath.Join(parentPath, inner), nil
}
func apply(path string, pid int) error {
if path == "" {
return nil
}
if err := os.MkdirAll(path, 0o755); err != nil {
return err
}
return cgroups.WriteCgroupProc(path, pid)
}

View file

@ -0,0 +1,24 @@
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type PerfEventGroup struct{}
func (s *PerfEventGroup) Name() string {
return "perf_event"
}
func (s *PerfEventGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *PerfEventGroup) Set(_ string, _ *configs.Resources) error {
return nil
}
func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View file

@ -0,0 +1,62 @@
package fs
import (
"math"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type PidsGroup struct{}
func (s *PidsGroup) Name() string {
return "pids"
}
func (s *PidsGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *PidsGroup) Set(path string, r *configs.Resources) error {
if r.PidsLimit != 0 {
// "max" is the fallback value.
limit := "max"
if r.PidsLimit > 0 {
limit = strconv.FormatInt(r.PidsLimit, 10)
}
if err := cgroups.WriteFile(path, "pids.max", limit); err != nil {
return err
}
}
return nil
}
func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
if !cgroups.PathExists(path) {
return nil
}
current, err := fscommon.GetCgroupParamUint(path, "pids.current")
if err != nil {
return err
}
max, err := fscommon.GetCgroupParamUint(path, "pids.max")
if err != nil {
return err
}
// If no limit is set, read from pids.max returns "max", which is
// converted to MaxUint64 by GetCgroupParamUint. Historically, we
// represent "no limit" for pids as 0, thus this conversion.
if max == math.MaxUint64 {
max = 0
}
stats.PidsStats.Current = current
stats.PidsStats.Limit = max
return nil
}

View file

@ -0,0 +1,25 @@
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type RdmaGroup struct{}
func (s *RdmaGroup) Name() string {
return "rdma"
}
func (s *RdmaGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *RdmaGroup) Set(path string, r *configs.Resources) error {
return fscommon.RdmaSet(path, r)
}
func (s *RdmaGroup) GetStats(path string, stats *cgroups.Stats) error {
return fscommon.RdmaGetStats(path, stats)
}

View file

@ -0,0 +1,87 @@
package fs2
import (
"bufio"
"os"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
func isCpuSet(r *configs.Resources) bool {
return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0
}
func setCpu(dirPath string, r *configs.Resources) error {
if !isCpuSet(r) {
return nil
}
// NOTE: .CpuShares is not used here. Conversion is the caller's responsibility.
if r.CpuWeight != 0 {
if err := cgroups.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil {
return err
}
}
if r.CpuQuota != 0 || r.CpuPeriod != 0 {
str := "max"
if r.CpuQuota > 0 {
str = strconv.FormatInt(r.CpuQuota, 10)
}
period := r.CpuPeriod
if period == 0 {
// This default value is documented in
// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
period = 100000
}
str += " " + strconv.FormatUint(period, 10)
if err := cgroups.WriteFile(dirPath, "cpu.max", str); err != nil {
return err
}
}
return nil
}
func statCpu(dirPath string, stats *cgroups.Stats) error {
const file = "cpu.stat"
f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY)
if err != nil {
return err
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
switch t {
case "usage_usec":
stats.CpuStats.CpuUsage.TotalUsage = v * 1000
case "user_usec":
stats.CpuStats.CpuUsage.UsageInUsermode = v * 1000
case "system_usec":
stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000
case "nr_periods":
stats.CpuStats.ThrottlingData.Periods = v
case "nr_throttled":
stats.CpuStats.ThrottlingData.ThrottledPeriods = v
case "throttled_usec":
stats.CpuStats.ThrottlingData.ThrottledTime = v * 1000
}
}
if err := sc.Err(); err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
return nil
}

View file

@ -0,0 +1,28 @@
package fs2
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
func isCpusetSet(r *configs.Resources) bool {
return r.CpusetCpus != "" || r.CpusetMems != ""
}
func setCpuset(dirPath string, r *configs.Resources) error {
if !isCpusetSet(r) {
return nil
}
if r.CpusetCpus != "" {
if err := cgroups.WriteFile(dirPath, "cpuset.cpus", r.CpusetCpus); err != nil {
return err
}
}
if r.CpusetMems != "" {
if err := cgroups.WriteFile(dirPath, "cpuset.mems", r.CpusetMems); err != nil {
return err
}
}
return nil
}

View file

@ -0,0 +1,152 @@
package fs2
import (
"fmt"
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
func supportedControllers() (string, error) {
return cgroups.ReadFile(UnifiedMountpoint, "/cgroup.controllers")
}
// needAnyControllers returns whether we enable some supported controllers or not,
// based on (1) controllers available and (2) resources that are being set.
// We don't check "pseudo" controllers such as
// "freezer" and "devices".
func needAnyControllers(r *configs.Resources) (bool, error) {
if r == nil {
return false, nil
}
// list of all available controllers
content, err := supportedControllers()
if err != nil {
return false, err
}
avail := make(map[string]struct{})
for _, ctr := range strings.Fields(content) {
avail[ctr] = struct{}{}
}
// check whether the controller if available or not
have := func(controller string) bool {
_, ok := avail[controller]
return ok
}
if isPidsSet(r) && have("pids") {
return true, nil
}
if isMemorySet(r) && have("memory") {
return true, nil
}
if isIoSet(r) && have("io") {
return true, nil
}
if isCpuSet(r) && have("cpu") {
return true, nil
}
if isCpusetSet(r) && have("cpuset") {
return true, nil
}
if isHugeTlbSet(r) && have("hugetlb") {
return true, nil
}
return false, nil
}
// containsDomainController returns whether the current config contains domain controller or not.
// Refer to: http://man7.org/linux/man-pages/man7/cgroups.7.html
// As at Linux 4.19, the following controllers are threaded: cpu, perf_event, and pids.
func containsDomainController(r *configs.Resources) bool {
return isMemorySet(r) || isIoSet(r) || isCpuSet(r) || isHugeTlbSet(r)
}
// CreateCgroupPath creates cgroupv2 path, enabling all the supported controllers.
func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
if !strings.HasPrefix(path, UnifiedMountpoint) {
return fmt.Errorf("invalid cgroup path %s", path)
}
content, err := supportedControllers()
if err != nil {
return err
}
const (
cgTypeFile = "cgroup.type"
cgStCtlFile = "cgroup.subtree_control"
)
ctrs := strings.Fields(content)
res := "+" + strings.Join(ctrs, " +")
elements := strings.Split(path, "/")
elements = elements[3:]
current := "/sys/fs"
for i, e := range elements {
current = filepath.Join(current, e)
if i > 0 {
if err := os.Mkdir(current, 0o755); err != nil {
if !os.IsExist(err) {
return err
}
} else {
// If the directory was created, be sure it is not left around on errors.
current := current
defer func() {
if Err != nil {
os.Remove(current)
}
}()
}
cgType, _ := cgroups.ReadFile(current, cgTypeFile)
cgType = strings.TrimSpace(cgType)
switch cgType {
// If the cgroup is in an invalid mode (usually this means there's an internal
// process in the cgroup tree, because we created a cgroup under an
// already-populated-by-other-processes cgroup), then we have to error out if
// the user requested controllers which are not thread-aware. However, if all
// the controllers requested are thread-aware we can simply put the cgroup into
// threaded mode.
case "domain invalid":
if containsDomainController(c.Resources) {
return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in an invalid state", current)
} else {
// Not entirely correct (in theory we'd always want to be a domain --
// since that means we're a properly delegated cgroup subtree) but in
// this case there's not much we can do and it's better than giving an
// error.
_ = cgroups.WriteFile(current, cgTypeFile, "threaded")
}
// If the cgroup is in (threaded) or (domain threaded) mode, we can only use thread-aware controllers
// (and you cannot usually take a cgroup out of threaded mode).
case "domain threaded":
fallthrough
case "threaded":
if containsDomainController(c.Resources) {
return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, cgType)
}
}
}
// enable all supported controllers
if i < len(elements)-1 {
if err := cgroups.WriteFile(current, cgStCtlFile, res); err != nil {
// try write one by one
allCtrs := strings.Split(res, " ")
for _, ctr := range allCtrs {
_ = cgroups.WriteFile(current, cgStCtlFile, ctr)
}
}
// Some controllers might not be enabled when rootless or containerized,
// but we don't catch the error here. (Caught in setXXX() functions.)
}
}
return nil
}

View file

@ -0,0 +1,99 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package fs2
import (
"bufio"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
)
const UnifiedMountpoint = "/sys/fs/cgroup"
func defaultDirPath(c *configs.Cgroup) (string, error) {
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return "", fmt.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c)
}
return _defaultDirPath(UnifiedMountpoint, c.Path, c.Parent, c.Name)
}
func _defaultDirPath(root, cgPath, cgParent, cgName string) (string, error) {
if (cgName != "" || cgParent != "") && cgPath != "" {
return "", errors.New("cgroup: either Path or Name and Parent should be used")
}
// XXX: Do not remove CleanPath. Path safety is important! -- cyphar
innerPath := utils.CleanPath(cgPath)
if innerPath == "" {
cgParent := utils.CleanPath(cgParent)
cgName := utils.CleanPath(cgName)
innerPath = filepath.Join(cgParent, cgName)
}
if filepath.IsAbs(innerPath) {
return filepath.Join(root, innerPath), nil
}
ownCgroup, err := parseCgroupFile("/proc/self/cgroup")
if err != nil {
return "", err
}
// The current user scope most probably has tasks in it already,
// making it impossible to enable controllers for its sub-cgroup.
// A parent cgroup (with no tasks in it) is what we need.
ownCgroup = filepath.Dir(ownCgroup)
return filepath.Join(root, ownCgroup, innerPath), nil
}
// parseCgroupFile parses /proc/PID/cgroup file and return string
func parseCgroupFile(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
return parseCgroupFromReader(f)
}
func parseCgroupFromReader(r io.Reader) (string, error) {
s := bufio.NewScanner(r)
for s.Scan() {
var (
text = s.Text()
parts = strings.SplitN(text, ":", 3)
)
if len(parts) < 3 {
return "", fmt.Errorf("invalid cgroup entry: %q", text)
}
// text is like "0::/user.slice/user-1001.slice/session-1.scope"
if parts[0] == "0" && parts[1] == "" {
return parts[2], nil
}
}
if err := s.Err(); err != nil {
return "", err
}
return "", errors.New("cgroup path not found")
}

View file

@ -0,0 +1,127 @@
package fs2
import (
"bufio"
"errors"
"fmt"
"os"
"strings"
"time"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
func setFreezer(dirPath string, state configs.FreezerState) error {
var stateStr string
switch state {
case configs.Undefined:
return nil
case configs.Frozen:
stateStr = "1"
case configs.Thawed:
stateStr = "0"
default:
return fmt.Errorf("invalid freezer state %q requested", state)
}
fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDWR)
if err != nil {
// We can ignore this request as long as the user didn't ask us to
// freeze the container (since without the freezer cgroup, that's a
// no-op).
if state != configs.Frozen {
return nil
}
return fmt.Errorf("freezer not supported: %w", err)
}
defer fd.Close()
if _, err := fd.WriteString(stateStr); err != nil {
return err
}
// Confirm that the cgroup did actually change states.
if actualState, err := readFreezer(dirPath, fd); err != nil {
return err
} else if actualState != state {
return fmt.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState)
}
return nil
}
func getFreezer(dirPath string) (configs.FreezerState, error) {
fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDONLY)
if err != nil {
// If the kernel is too old, then we just treat the freezer as being in
// an "undefined" state.
if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) {
err = nil
}
return configs.Undefined, err
}
defer fd.Close()
return readFreezer(dirPath, fd)
}
func readFreezer(dirPath string, fd *os.File) (configs.FreezerState, error) {
if _, err := fd.Seek(0, 0); err != nil {
return configs.Undefined, err
}
state := make([]byte, 2)
if _, err := fd.Read(state); err != nil {
return configs.Undefined, err
}
switch string(state) {
case "0\n":
return configs.Thawed, nil
case "1\n":
return waitFrozen(dirPath)
default:
return configs.Undefined, fmt.Errorf(`unknown "cgroup.freeze" state: %q`, state)
}
}
// waitFrozen polls cgroup.events until it sees "frozen 1" in it.
func waitFrozen(dirPath string) (configs.FreezerState, error) {
fd, err := cgroups.OpenFile(dirPath, "cgroup.events", unix.O_RDONLY)
if err != nil {
return configs.Undefined, err
}
defer fd.Close()
// XXX: Simple wait/read/retry is used here. An implementation
// based on poll(2) or inotify(7) is possible, but it makes the code
// much more complicated. Maybe address this later.
const (
// Perform maxIter with waitTime in between iterations.
waitTime = 10 * time.Millisecond
maxIter = 1000
)
scanner := bufio.NewScanner(fd)
for i := 0; scanner.Scan(); {
if i == maxIter {
return configs.Undefined, fmt.Errorf("timeout of %s reached waiting for the cgroup to freeze", waitTime*maxIter)
}
line := scanner.Text()
val := strings.TrimPrefix(line, "frozen ")
if val != line { // got prefix
if val[0] == '1' {
return configs.Frozen, nil
}
i++
// wait, then re-read
time.Sleep(waitTime)
_, err := fd.Seek(0, 0)
if err != nil {
return configs.Undefined, err
}
}
}
// Should only reach here either on read error,
// or if the file does not contain "frozen " line.
return configs.Undefined, scanner.Err()
}

View file

@ -0,0 +1,271 @@
package fs2
import (
"errors"
"fmt"
"os"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type parseError = fscommon.ParseError
type manager struct {
config *configs.Cgroup
// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
dirPath string
// controllers is content of "cgroup.controllers" file.
// excludes pseudo-controllers ("devices" and "freezer").
controllers map[string]struct{}
}
// NewManager creates a manager for cgroup v2 unified hierarchy.
// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope".
// If dirPath is empty, it is automatically set using config.
func NewManager(config *configs.Cgroup, dirPath string) (cgroups.Manager, error) {
if dirPath == "" {
var err error
dirPath, err = defaultDirPath(config)
if err != nil {
return nil, err
}
}
m := &manager{
config: config,
dirPath: dirPath,
}
return m, nil
}
func (m *manager) getControllers() error {
if m.controllers != nil {
return nil
}
data, err := cgroups.ReadFile(m.dirPath, "cgroup.controllers")
if err != nil {
if m.config.Rootless && m.config.Path == "" {
return nil
}
return err
}
fields := strings.Fields(data)
m.controllers = make(map[string]struct{}, len(fields))
for _, c := range fields {
m.controllers[c] = struct{}{}
}
return nil
}
func (m *manager) Apply(pid int) error {
if err := CreateCgroupPath(m.dirPath, m.config); err != nil {
// Related tests:
// - "runc create (no limits + no cgrouppath + no permission) succeeds"
// - "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error"
// - "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if m.config.Rootless {
if m.config.Path == "" {
if blNeed, nErr := needAnyControllers(m.config.Resources); nErr == nil && !blNeed {
return nil
}
return fmt.Errorf("rootless needs no limits + no cgrouppath when no permission is granted for cgroups: %w", err)
}
}
return err
}
if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil {
return err
}
return nil
}
func (m *manager) GetPids() ([]int, error) {
return cgroups.GetPids(m.dirPath)
}
func (m *manager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.dirPath)
}
func (m *manager) GetStats() (*cgroups.Stats, error) {
var errs []error
st := cgroups.NewStats()
// pids (since kernel 4.5)
if err := statPids(m.dirPath, st); err != nil {
errs = append(errs, err)
}
// memory (since kernel 4.5)
if err := statMemory(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// io (since kernel 4.5)
if err := statIo(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// cpu (since kernel 4.15)
// Note cpu.stat is available even if the controller is not enabled.
if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// hugetlb (since kernel 5.6)
if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// rdma (since kernel 4.11)
if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
if len(errs) > 0 && !m.config.Rootless {
return st, fmt.Errorf("error while statting cgroup v2: %+v", errs)
}
return st, nil
}
func (m *manager) Freeze(state configs.FreezerState) error {
if m.config.Resources == nil {
return errors.New("cannot toggle freezer: cgroups not configured for container")
}
if err := setFreezer(m.dirPath, state); err != nil {
return err
}
m.config.Resources.Freezer = state
return nil
}
func (m *manager) Destroy() error {
return cgroups.RemovePath(m.dirPath)
}
func (m *manager) Path(_ string) string {
return m.dirPath
}
func (m *manager) Set(r *configs.Resources) error {
if r == nil {
return nil
}
if err := m.getControllers(); err != nil {
return err
}
// pids (since kernel 4.5)
if err := setPids(m.dirPath, r); err != nil {
return err
}
// memory (since kernel 4.5)
if err := setMemory(m.dirPath, r); err != nil {
return err
}
// io (since kernel 4.5)
if err := setIo(m.dirPath, r); err != nil {
return err
}
// cpu (since kernel 4.15)
if err := setCpu(m.dirPath, r); err != nil {
return err
}
// devices (since kernel 4.15, pseudo-controller)
//
// When rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if err := setDevices(m.dirPath, r); err != nil {
if !m.config.Rootless || errors.Is(err, cgroups.ErrDevicesUnsupported) {
return err
}
}
// cpuset (since kernel 5.0)
if err := setCpuset(m.dirPath, r); err != nil {
return err
}
// hugetlb (since kernel 5.6)
if err := setHugeTlb(m.dirPath, r); err != nil {
return err
}
// rdma (since kernel 4.11)
if err := fscommon.RdmaSet(m.dirPath, r); err != nil {
return err
}
// freezer (since kernel 5.2, pseudo-controller)
if err := setFreezer(m.dirPath, r.Freezer); err != nil {
return err
}
if err := m.setUnified(r.Unified); err != nil {
return err
}
m.config.Resources = r
return nil
}
func setDevices(dirPath string, r *configs.Resources) error {
if cgroups.DevicesSetV2 == nil {
if len(r.Devices) > 0 {
return cgroups.ErrDevicesUnsupported
}
return nil
}
return cgroups.DevicesSetV2(dirPath, r)
}
func (m *manager) setUnified(res map[string]string) error {
for k, v := range res {
if strings.Contains(k, "/") {
return fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
}
if err := cgroups.WriteFile(m.dirPath, k, v); err != nil {
// Check for both EPERM and ENOENT since O_CREAT is used by WriteFile.
if errors.Is(err, os.ErrPermission) || errors.Is(err, os.ErrNotExist) {
// Check if a controller is available,
// to give more specific error if not.
sk := strings.SplitN(k, ".", 2)
if len(sk) != 2 {
return fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k)
}
c := sk[0]
if _, ok := m.controllers[c]; !ok && c != "cgroup" {
return fmt.Errorf("unified resource %q can't be set: controller %q not available", k, c)
}
}
return fmt.Errorf("unable to set unified resource %q: %w", k, err)
}
}
return nil
}
func (m *manager) GetPaths() map[string]string {
paths := make(map[string]string, 1)
paths[""] = m.dirPath
return paths
}
func (m *manager) GetCgroups() (*configs.Cgroup, error) {
return m.config, nil
}
func (m *manager) GetFreezerState() (configs.FreezerState, error) {
return getFreezer(m.dirPath)
}
func (m *manager) Exists() bool {
return cgroups.PathExists(m.dirPath)
}
func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.events", "oom_kill")
}
func (m *manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.dirPath)
if err != nil && m.config.Rootless && os.IsNotExist(err) {
err = nil
}
return c, err
}

View file

@ -0,0 +1,48 @@
package fs2
import (
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
func isHugeTlbSet(r *configs.Resources) bool {
return len(r.HugetlbLimit) > 0
}
func setHugeTlb(dirPath string, r *configs.Resources) error {
if !isHugeTlbSet(r) {
return nil
}
for _, hugetlb := range r.HugetlbLimit {
if err := cgroups.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}
}
return nil
}
func statHugeTlb(dirPath string, stats *cgroups.Stats) error {
hugetlbStats := cgroups.HugetlbStats{}
for _, pagesize := range cgroups.HugePageSizes() {
value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current")
if err != nil {
return err
}
hugetlbStats.Usage = value
fileName := "hugetlb." + pagesize + ".events"
value, err = fscommon.GetValueByKey(dirPath, fileName, "max")
if err != nil {
return err
}
hugetlbStats.Failcnt = value
stats.HugetlbStats[pagesize] = hugetlbStats
}
return nil
}

View file

@ -0,0 +1,193 @@
package fs2
import (
"bufio"
"bytes"
"fmt"
"os"
"strconv"
"strings"
"github.com/sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
func isIoSet(r *configs.Resources) bool {
return r.BlkioWeight != 0 ||
len(r.BlkioWeightDevice) > 0 ||
len(r.BlkioThrottleReadBpsDevice) > 0 ||
len(r.BlkioThrottleWriteBpsDevice) > 0 ||
len(r.BlkioThrottleReadIOPSDevice) > 0 ||
len(r.BlkioThrottleWriteIOPSDevice) > 0
}
// bfqDeviceWeightSupported checks for per-device BFQ weight support (added
// in kernel v5.4, commit 795fe54c2a8) by reading from "io.bfq.weight".
func bfqDeviceWeightSupported(bfq *os.File) bool {
if bfq == nil {
return false
}
_, _ = bfq.Seek(0, 0)
buf := make([]byte, 32)
_, _ = bfq.Read(buf)
// If only a single number (default weight) if read back, we have older kernel.
_, err := strconv.ParseInt(string(bytes.TrimSpace(buf)), 10, 64)
return err != nil
}
func setIo(dirPath string, r *configs.Resources) error {
if !isIoSet(r) {
return nil
}
// If BFQ IO scheduler is available, use it.
var bfq *os.File
if r.BlkioWeight != 0 || len(r.BlkioWeightDevice) > 0 {
var err error
bfq, err = cgroups.OpenFile(dirPath, "io.bfq.weight", os.O_RDWR)
if err == nil {
defer bfq.Close()
} else if !os.IsNotExist(err) {
return err
}
}
if r.BlkioWeight != 0 {
if bfq != nil { // Use BFQ.
if _, err := bfq.WriteString(strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
return err
}
} else {
// Fallback to io.weight with a conversion scheme.
v := cgroups.ConvertBlkIOToIOWeightValue(r.BlkioWeight)
if err := cgroups.WriteFile(dirPath, "io.weight", strconv.FormatUint(v, 10)); err != nil {
return err
}
}
}
if bfqDeviceWeightSupported(bfq) {
for _, wd := range r.BlkioWeightDevice {
if _, err := bfq.WriteString(wd.WeightString() + "\n"); err != nil {
return fmt.Errorf("setting device weight %q: %w", wd.WeightString(), err)
}
}
}
for _, td := range r.BlkioThrottleReadBpsDevice {
if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteBpsDevice {
if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleReadIOPSDevice {
if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteIOPSDevice {
if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil {
return err
}
}
return nil
}
func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) {
ret := map[string][]string{}
f, err := cgroups.OpenFile(dirPath, name, os.O_RDONLY)
if err != nil {
return nil, err
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
line := scanner.Text()
parts := strings.Fields(line)
if len(parts) < 2 {
continue
}
ret[parts[0]] = parts[1:]
}
if err := scanner.Err(); err != nil {
return nil, &parseError{Path: dirPath, File: name, Err: err}
}
return ret, nil
}
func statIo(dirPath string, stats *cgroups.Stats) error {
const file = "io.stat"
values, err := readCgroup2MapFile(dirPath, file)
if err != nil {
return err
}
// more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt
var parsedStats cgroups.BlkioStats
for k, v := range values {
d := strings.Split(k, ":")
if len(d) != 2 {
continue
}
major, err := strconv.ParseUint(d[0], 10, 64)
if err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
minor, err := strconv.ParseUint(d[1], 10, 64)
if err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
for _, item := range v {
d := strings.Split(item, "=")
if len(d) != 2 {
continue
}
op := d[0]
// Map to the cgroupv1 naming and layout (in separate tables).
var targetTable *[]cgroups.BlkioStatEntry
switch op {
// Equivalent to cgroupv1's blkio.io_service_bytes.
case "rbytes":
op = "Read"
targetTable = &parsedStats.IoServiceBytesRecursive
case "wbytes":
op = "Write"
targetTable = &parsedStats.IoServiceBytesRecursive
// Equivalent to cgroupv1's blkio.io_serviced.
case "rios":
op = "Read"
targetTable = &parsedStats.IoServicedRecursive
case "wios":
op = "Write"
targetTable = &parsedStats.IoServicedRecursive
default:
// Skip over entries we cannot map to cgroupv1 stats for now.
// In the future we should expand the stats struct to include
// them.
logrus.Debugf("cgroupv2 io stats: skipping over unmappable %s entry", item)
continue
}
value, err := strconv.ParseUint(d[1], 10, 64)
if err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
entry := cgroups.BlkioStatEntry{
Op: op,
Major: major,
Minor: minor,
Value: value,
}
*targetTable = append(*targetTable, entry)
}
}
stats.BlkioStats = parsedStats
return nil
}

View file

@ -0,0 +1,216 @@
package fs2
import (
"bufio"
"errors"
"math"
"os"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
// numToStr converts an int64 value to a string for writing to a
// cgroupv2 files with .min, .max, .low, or .high suffix.
// The value of -1 is converted to "max" for cgroupv1 compatibility
// (which used to write -1 to remove the limit).
func numToStr(value int64) (ret string) {
switch {
case value == 0:
ret = ""
case value == -1:
ret = "max"
default:
ret = strconv.FormatInt(value, 10)
}
return ret
}
func isMemorySet(r *configs.Resources) bool {
return r.MemoryReservation != 0 || r.Memory != 0 || r.MemorySwap != 0
}
func setMemory(dirPath string, r *configs.Resources) error {
if !isMemorySet(r) {
return nil
}
swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
if err != nil {
return err
}
swapStr := numToStr(swap)
if swapStr == "" && swap == 0 && r.MemorySwap > 0 {
// memory and memorySwap set to the same value -- disable swap
swapStr = "0"
}
// never write empty string to `memory.swap.max`, it means set to 0.
if swapStr != "" {
if err := cgroups.WriteFile(dirPath, "memory.swap.max", swapStr); err != nil {
return err
}
}
if val := numToStr(r.Memory); val != "" {
if err := cgroups.WriteFile(dirPath, "memory.max", val); err != nil {
return err
}
}
// cgroup.Resources.KernelMemory is ignored
if val := numToStr(r.MemoryReservation); val != "" {
if err := cgroups.WriteFile(dirPath, "memory.low", val); err != nil {
return err
}
}
return nil
}
func statMemory(dirPath string, stats *cgroups.Stats) error {
const file = "memory.stat"
statsFile, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY)
if err != nil {
return err
}
defer statsFile.Close()
sc := bufio.NewScanner(statsFile)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
stats.MemoryStats.Stats[t] = v
}
if err := sc.Err(); err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
stats.MemoryStats.Cache = stats.MemoryStats.Stats["file"]
// Unlike cgroup v1 which has memory.use_hierarchy binary knob,
// cgroup v2 is always hierarchical.
stats.MemoryStats.UseHierarchy = true
memoryUsage, err := getMemoryDataV2(dirPath, "")
if err != nil {
if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint {
// The root cgroup does not have memory.{current,max}
// so emulate those using data from /proc/meminfo.
return statsFromMeminfo(stats)
}
return err
}
stats.MemoryStats.Usage = memoryUsage
swapUsage, err := getMemoryDataV2(dirPath, "swap")
if err != nil {
return err
}
// As cgroup v1 reports SwapUsage values as mem+swap combined,
// while in cgroup v2 swap values do not include memory,
// report combined mem+swap for v1 compatibility.
swapUsage.Usage += memoryUsage.Usage
if swapUsage.Limit != math.MaxUint64 {
swapUsage.Limit += memoryUsage.Limit
}
stats.MemoryStats.SwapUsage = swapUsage
return nil
}
func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
memoryData := cgroups.MemoryData{}
moduleName := "memory"
if name != "" {
moduleName = "memory." + name
}
usage := moduleName + ".current"
limit := moduleName + ".max"
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
if name != "" && os.IsNotExist(err) {
// Ignore EEXIST as there's no swap accounting
// if kernel CONFIG_MEMCG_SWAP is not set or
// swapaccount=0 kernel boot parameter is given.
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, err
}
memoryData.Usage = value
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
return cgroups.MemoryData{}, err
}
memoryData.Limit = value
return memoryData, nil
}
func statsFromMeminfo(stats *cgroups.Stats) error {
const file = "/proc/meminfo"
f, err := os.Open(file)
if err != nil {
return err
}
defer f.Close()
// Fields we are interested in.
var (
swap_free uint64
swap_total uint64
main_total uint64
main_free uint64
)
mem := map[string]*uint64{
"SwapFree": &swap_free,
"SwapTotal": &swap_total,
"MemTotal": &main_total,
"MemFree": &main_free,
}
found := 0
sc := bufio.NewScanner(f)
for sc.Scan() {
parts := strings.SplitN(sc.Text(), ":", 3)
if len(parts) != 2 {
// Should not happen.
continue
}
k := parts[0]
p, ok := mem[k]
if !ok {
// Unknown field -- not interested.
continue
}
vStr := strings.TrimSpace(strings.TrimSuffix(parts[1], " kB"))
*p, err = strconv.ParseUint(vStr, 10, 64)
if err != nil {
return &parseError{File: file, Err: errors.New("bad value for " + k)}
}
found++
if found == len(mem) {
// Got everything we need -- skip the rest.
break
}
}
if err := sc.Err(); err != nil {
return &parseError{Path: "", File: file, Err: err}
}
stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024
stats.MemoryStats.SwapUsage.Limit = math.MaxUint64
stats.MemoryStats.Usage.Usage = (main_total - main_free) * 1024
stats.MemoryStats.Usage.Limit = math.MaxUint64
return nil
}

View file

@ -0,0 +1,72 @@
package fs2
import (
"errors"
"math"
"os"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
func isPidsSet(r *configs.Resources) bool {
return r.PidsLimit != 0
}
func setPids(dirPath string, r *configs.Resources) error {
if !isPidsSet(r) {
return nil
}
if val := numToStr(r.PidsLimit); val != "" {
if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil {
return err
}
}
return nil
}
func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error {
// if the controller is not enabled, let's read PIDS from cgroups.procs
// (or threads if cgroup.threads is enabled)
contents, err := cgroups.ReadFile(dirPath, "cgroup.procs")
if errors.Is(err, unix.ENOTSUP) {
contents, err = cgroups.ReadFile(dirPath, "cgroup.threads")
}
if err != nil {
return err
}
pids := strings.Count(contents, "\n")
stats.PidsStats.Current = uint64(pids)
stats.PidsStats.Limit = 0
return nil
}
func statPids(dirPath string, stats *cgroups.Stats) error {
current, err := fscommon.GetCgroupParamUint(dirPath, "pids.current")
if err != nil {
if os.IsNotExist(err) {
return statPidsFromCgroupProcs(dirPath, stats)
}
return err
}
max, err := fscommon.GetCgroupParamUint(dirPath, "pids.max")
if err != nil {
return err
}
// If no limit is set, read from pids.max returns "max", which is
// converted to MaxUint64 by GetCgroupParamUint. Historically, we
// represent "no limit" for pids as 0, thus this conversion.
if max == math.MaxUint64 {
max = 0
}
stats.PidsStats.Current = current
stats.PidsStats.Limit = max
return nil
}

View file

@ -0,0 +1,121 @@
package fscommon
import (
"bufio"
"errors"
"math"
"os"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"golang.org/x/sys/unix"
)
// parseRdmaKV parses raw string to RdmaEntry.
func parseRdmaKV(raw string, entry *cgroups.RdmaEntry) error {
var value uint32
parts := strings.SplitN(raw, "=", 3)
if len(parts) != 2 {
return errors.New("Unable to parse RDMA entry")
}
k, v := parts[0], parts[1]
if v == "max" {
value = math.MaxUint32
} else {
val64, err := strconv.ParseUint(v, 10, 32)
if err != nil {
return err
}
value = uint32(val64)
}
if k == "hca_handle" {
entry.HcaHandles = value
} else if k == "hca_object" {
entry.HcaObjects = value
}
return nil
}
// readRdmaEntries reads and converts array of rawstrings to RdmaEntries from file.
// example entry: mlx4_0 hca_handle=2 hca_object=2000
func readRdmaEntries(dir, file string) ([]cgroups.RdmaEntry, error) {
rdmaEntries := make([]cgroups.RdmaEntry, 0)
fd, err := cgroups.OpenFile(dir, file, unix.O_RDONLY)
if err != nil {
return nil, err
}
defer fd.Close() //nolint:errorlint
scanner := bufio.NewScanner(fd)
for scanner.Scan() {
parts := strings.SplitN(scanner.Text(), " ", 4)
if len(parts) == 3 {
entry := new(cgroups.RdmaEntry)
entry.Device = parts[0]
err = parseRdmaKV(parts[1], entry)
if err != nil {
continue
}
err = parseRdmaKV(parts[2], entry)
if err != nil {
continue
}
rdmaEntries = append(rdmaEntries, *entry)
}
}
return rdmaEntries, scanner.Err()
}
// RdmaGetStats returns rdma stats such as totalLimit and current entries.
func RdmaGetStats(path string, stats *cgroups.Stats) error {
currentEntries, err := readRdmaEntries(path, "rdma.current")
if err != nil {
if errors.Is(err, os.ErrNotExist) {
err = nil
}
return err
}
maxEntries, err := readRdmaEntries(path, "rdma.max")
if err != nil {
return err
}
// If device got removed between reading two files, ignore returning stats.
if len(currentEntries) != len(maxEntries) {
return nil
}
stats.RdmaStats = cgroups.RdmaStats{
RdmaLimit: maxEntries,
RdmaCurrent: currentEntries,
}
return nil
}
func createCmdString(device string, limits configs.LinuxRdma) string {
cmdString := device
if limits.HcaHandles != nil {
cmdString += " hca_handle=" + strconv.FormatUint(uint64(*limits.HcaHandles), 10)
}
if limits.HcaObjects != nil {
cmdString += " hca_object=" + strconv.FormatUint(uint64(*limits.HcaObjects), 10)
}
return cmdString
}
// RdmaSet sets RDMA resources.
func RdmaSet(path string, r *configs.Resources) error {
for device, limits := range r.Rdma {
if err := cgroups.WriteFile(path, "rdma.max", createCmdString(device, limits)); err != nil {
return err
}
}
return nil
}

View file

@ -0,0 +1,145 @@
package fscommon
import (
"errors"
"fmt"
"math"
"path"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
var (
// Deprecated: use cgroups.OpenFile instead.
OpenFile = cgroups.OpenFile
// Deprecated: use cgroups.ReadFile instead.
ReadFile = cgroups.ReadFile
// Deprecated: use cgroups.WriteFile instead.
WriteFile = cgroups.WriteFile
)
// ParseError records a parse error details, including the file path.
type ParseError struct {
Path string
File string
Err error
}
func (e *ParseError) Error() string {
return "unable to parse " + path.Join(e.Path, e.File) + ": " + e.Err.Error()
}
func (e *ParseError) Unwrap() error { return e.Err }
// ParseUint converts a string to an uint64 integer.
// Negative values are returned at zero as, due to kernel bugs,
// some of the memory cgroup stats can be negative.
func ParseUint(s string, base, bitSize int) (uint64, error) {
value, err := strconv.ParseUint(s, base, bitSize)
if err != nil {
intValue, intErr := strconv.ParseInt(s, base, bitSize)
// 1. Handle negative values greater than MinInt64 (and)
// 2. Handle negative values lesser than MinInt64
if intErr == nil && intValue < 0 {
return 0, nil
} else if errors.Is(intErr, strconv.ErrRange) && intValue < 0 {
return 0, nil
}
return value, err
}
return value, nil
}
// ParseKeyValue parses a space-separated "name value" kind of cgroup
// parameter and returns its key as a string, and its value as uint64
// (ParseUint is used to convert the value). For example,
// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234.
func ParseKeyValue(t string) (string, uint64, error) {
parts := strings.SplitN(t, " ", 3)
if len(parts) != 2 {
return "", 0, fmt.Errorf("line %q is not in key value format", t)
}
value, err := ParseUint(parts[1], 10, 64)
if err != nil {
return "", 0, err
}
return parts[0], value, nil
}
// GetValueByKey reads a key-value pairs from the specified cgroup file,
// and returns a value of the specified key. ParseUint is used for value
// conversion.
func GetValueByKey(path, file, key string) (uint64, error) {
content, err := cgroups.ReadFile(path, file)
if err != nil {
return 0, err
}
lines := strings.Split(content, "\n")
for _, line := range lines {
arr := strings.Split(line, " ")
if len(arr) == 2 && arr[0] == key {
val, err := ParseUint(arr[1], 10, 64)
if err != nil {
err = &ParseError{Path: path, File: file, Err: err}
}
return val, err
}
}
return 0, nil
}
// GetCgroupParamUint reads a single uint64 value from the specified cgroup file.
// If the value read is "max", the math.MaxUint64 is returned.
func GetCgroupParamUint(path, file string) (uint64, error) {
contents, err := GetCgroupParamString(path, file)
if err != nil {
return 0, err
}
contents = strings.TrimSpace(contents)
if contents == "max" {
return math.MaxUint64, nil
}
res, err := ParseUint(contents, 10, 64)
if err != nil {
return res, &ParseError{Path: path, File: file, Err: err}
}
return res, nil
}
// GetCgroupParamInt reads a single int64 value from specified cgroup file.
// If the value read is "max", the math.MaxInt64 is returned.
func GetCgroupParamInt(path, file string) (int64, error) {
contents, err := cgroups.ReadFile(path, file)
if err != nil {
return 0, err
}
contents = strings.TrimSpace(contents)
if contents == "max" {
return math.MaxInt64, nil
}
res, err := strconv.ParseInt(contents, 10, 64)
if err != nil {
return res, &ParseError{Path: path, File: file, Err: err}
}
return res, nil
}
// GetCgroupParamString reads a string from the specified cgroup file.
func GetCgroupParamString(path, file string) (string, error) {
contents, err := cgroups.ReadFile(path, file)
if err != nil {
return "", err
}
return strings.TrimSpace(contents), nil
}

View file

@ -2,8 +2,8 @@ package configs
import "fmt"
// blockIODevice holds major:minor format supported in blkio cgroup
type blockIODevice struct {
// BlockIODevice holds major:minor format supported in blkio cgroup.
type BlockIODevice struct {
// Major is the device's major number
Major int64 `json:"major"`
// Minor is the device's minor number
@ -12,7 +12,7 @@ type blockIODevice struct {
// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair
type WeightDevice struct {
blockIODevice
BlockIODevice
// Weight is the bandwidth rate for the device, range is from 10 to 1000
Weight uint16 `json:"weight"`
// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
@ -41,7 +41,7 @@ func (wd *WeightDevice) LeafWeightString() string {
// ThrottleDevice struct holds a `major:minor rate_per_second` pair
type ThrottleDevice struct {
blockIODevice
BlockIODevice
// Rate is the IO rate limit per cgroup per device
Rate uint64 `json:"rate"`
}

View file

@ -83,9 +83,6 @@ type Syscall struct {
Args []*Arg `json:"args"`
}
// TODO Windows. Many of these fields should be factored out into those parts
// which are common across platforms, and those which are platform specific.
// Config defines configuration options for executing a process inside a contained environment.
type Config struct {
// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs

View file

@ -35,12 +35,6 @@ type Mount struct {
// Extensions are additional flags that are specific to runc.
Extensions int `json:"extensions"`
// Optional Command to be run before Source is mounted.
PremountCmds []Command `json:"premount_cmds"`
// Optional Command to be run after Source is mounted.
PostmountCmds []Command `json:"postmount_cmds"`
}
func (m *Mount) IsBind() bool {

View file

@ -132,19 +132,16 @@ func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
return fn(procfd)
}
// SearchLabels searches a list of key-value pairs for the provided key and
// returns the corresponding value. The pairs must be separated with '='.
func SearchLabels(labels []string, query string) string {
for _, l := range labels {
parts := strings.SplitN(l, "=", 2)
if len(parts) < 2 {
continue
}
if parts[0] == query {
return parts[1]
// SearchLabels searches through a list of key=value pairs for a given key,
// returning its value, and the binary flag telling whether the key exist.
func SearchLabels(labels []string, key string) (string, bool) {
key += "="
for _, s := range labels {
if strings.HasPrefix(s, key) {
return s[len(key):], true
}
}
return ""
return "", false
}
// Annotations returns the bundle path and user defined annotations from the

View file

@ -2,6 +2,31 @@ libseccomp-golang: Releases
===============================================================================
https://github.com/seccomp/libseccomp-golang
* Version 0.10.0 - June 9, 2022
- Minimum supported version of libseccomp bumped to v2.3.1
- Add seccomp userspace notification API (ActNotify, filter.*Notif*)
- Add filter.{Get,Set}SSB (to support SCMP_FLTATR_CTL_SSB)
- Add filter.{Get,Set}Optimize (to support SCMP_FLTATR_CTL_OPTIMIZE)
- Add filter.{Get,Set}RawRC (to support SCMP_FLTATR_API_SYSRAWRC)
- Add ArchPARISC, ArchPARISC64, ArchRISCV64
- Add ActKillProcess and ActKillThread; deprecate ActKill
- Add go module support
- Return ErrSyscallDoesNotExist when unable to resolve a syscall
- Fix some functions to check for both kernel level API and libseccomp version
- Fix MakeCondition to use sanitizeCompareOp
- Fix AddRule to handle EACCES (from libseccomp >= 2.5.0)
- Updated the main docs and converted to README.md
- Added CONTRIBUTING.md, SECURITY.md, and administrative docs under doc/admin
- Add GitHub action CI, enable more linters
- test: test against various libseccomp versions
- test: fix and simplify execInSubprocess
- test: fix APILevelIsSupported
- Refactor the Errno(-1 * retCode) pattern
- Refactor/unify libseccomp version / API level checks
- Code cleanups (linter, formatting, spelling fixes)
- Cleanup: use errors.New instead of fmt.Errorf where appropriate
- Cleanup: remove duplicated cgo stuff, redundant linux build tag
* Version 0.9.1 - May 21, 2019
- Minimum supported version of libseccomp bumped to v2.2.0
- Use Libseccomp's `seccomp_version` API to retrieve library version

View file

@ -22,19 +22,37 @@ The library source repository currently lives on GitHub at the following URLs:
* https://github.com/seccomp/libseccomp-golang
* https://github.com/seccomp/libseccomp
The project mailing list is currently hosted on Google Groups at the URL below,
please note that a Google account is not required to subscribe to the mailing
list.
* https://groups.google.com/d/forum/libseccomp
Documentation for this package is also available at:
* https://pkg.go.dev/github.com/seccomp/libseccomp-golang
## Verifying Releases
Starting with libseccomp-golang v0.10.0, the git tag corresponding to each
release should be signed by one of the libseccomp-golang maintainers. It is
recommended that before use you verify the release tags using the following
command:
% git tag -v <tag>
At present, only the following keys, specified via the fingerprints below, are
authorized to sign official libseccomp-golang release tags:
Paul Moore <paul@paul-moore.com>
7100 AADF AE6E 6E94 0D2E 0AD6 55E4 5A5A E8CA 7C8A
Tom Hromatka <tom.hromatka@oracle.com>
47A6 8FCE 37C7 D702 4FD6 5E11 356C E62C 2B52 4099
Kir Kolyshkin <kolyshkin@gmail.com>
C242 8CD7 5720 FACD CF76 B6EA 17DE 5ECB 75A1 100E
More information on GnuPG and git tag verification can be found at their
respective websites: https://git-scm.com/docs/git and https://gnupg.org.
## Installing the package
# go get github.com/seccomp/libseccomp-golang
% go get github.com/seccomp/libseccomp-golang
## Contributing

View file

@ -22,6 +22,7 @@ window.
* Paul Moore, paul@paul-moore.com
* Tom Hromatka, tom.hromatka@oracle.com
* Kir Kolyshkin, kolyshkin@gmail.com
### Resolving Sensitive Security Issues

View file

@ -7,6 +7,7 @@
package seccomp
import (
"errors"
"fmt"
"os"
"runtime"
@ -245,8 +246,8 @@ const (
)
// ErrSyscallDoesNotExist represents an error condition where
// libseccomp is unable to resolve the syscall
var ErrSyscallDoesNotExist = fmt.Errorf("could not resolve syscall name")
// libseccomp is unable to resolve the syscall.
var ErrSyscallDoesNotExist = errors.New("could not resolve syscall name")
const (
// Userspace notification response flags
@ -556,7 +557,7 @@ func MakeCondition(arg uint, comparison ScmpCompareOp, values ...uint64) (ScmpCo
} else if len(values) > 2 {
return condStruct, fmt.Errorf("conditions can have at most 2 arguments (%d given)", len(values))
} else if len(values) == 0 {
return condStruct, fmt.Errorf("must provide at least one value to compare against")
return condStruct, errors.New("must provide at least one value to compare against")
}
condStruct.Argument = arg
@ -611,7 +612,7 @@ func NewFilter(defaultAction ScmpAction) (*ScmpFilter, error) {
fPtr := C.seccomp_init(defaultAction.toNative())
if fPtr == nil {
return nil, fmt.Errorf("could not create filter")
return nil, errors.New("could not create filter")
}
filter := new(ScmpFilter)
@ -623,7 +624,7 @@ func NewFilter(defaultAction ScmpAction) (*ScmpFilter, error) {
// If the kernel does not support TSYNC, allow us to continue without error.
if err := filter.setFilterAttr(filterAttrTsync, 0x1); err != nil && err != syscall.ENOTSUP {
filter.Release()
return nil, fmt.Errorf("could not create filter - error setting tsync bit: %v", err)
return nil, fmt.Errorf("could not create filter: error setting tsync bit: %w", err)
}
return filter, nil
@ -695,14 +696,14 @@ func (f *ScmpFilter) Merge(src *ScmpFilter) error {
defer src.lock.Unlock()
if !src.valid || !f.valid {
return fmt.Errorf("one or more of the filter contexts is invalid or uninitialized")
return errors.New("one or more of the filter contexts is invalid or uninitialized")
}
// Merge the filters
if retCode := C.seccomp_merge(f.filterCtx, src.filterCtx); retCode != 0 {
e := errRc(retCode)
if e == syscall.EINVAL {
return fmt.Errorf("filters could not be merged due to a mismatch in attributes or invalid filter")
return fmt.Errorf("filters could not be merged due to a mismatch in attributes or invalid filter: %w", e)
}
return e
}

View file

@ -340,7 +340,7 @@ func ensureSupportedVersion() error {
func getAPI() (uint, error) {
api := C.seccomp_api_get()
if api == 0 {
return 0, fmt.Errorf("API level operations are not supported")
return 0, errors.New("API level operations are not supported")
}
return uint(api), nil
@ -349,11 +349,12 @@ func getAPI() (uint, error) {
// Set the API level
func setAPI(api uint) error {
if retCode := C.seccomp_api_set(C.uint(api)); retCode != 0 {
if errRc(retCode) == syscall.EOPNOTSUPP {
return fmt.Errorf("API level operations are not supported")
e := errRc(retCode)
if e == syscall.EOPNOTSUPP {
return errors.New("API level operations are not supported")
}
return fmt.Errorf("could not set API level: %v", retCode)
return fmt.Errorf("could not set API level: %w", e)
}
return nil
@ -411,7 +412,7 @@ func (f *ScmpFilter) setFilterAttr(attr scmpFilterAttr, value C.uint32_t) error
// Wrapper for seccomp_rule_add_... functions
func (f *ScmpFilter) addRuleWrapper(call ScmpSyscall, action ScmpAction, exact bool, length C.uint, cond C.scmp_cast_t) error {
if length != 0 && cond == nil {
return fmt.Errorf("null conditions list, but length is nonzero")
return errors.New("null conditions list, but length is nonzero")
}
var retCode C.int
@ -430,7 +431,7 @@ func (f *ScmpFilter) addRuleWrapper(call ScmpSyscall, action ScmpAction, exact b
case syscall.EPERM, syscall.EACCES:
return errDefAction
case syscall.EINVAL:
return fmt.Errorf("two checks on same syscall argument")
return errors.New("two checks on same syscall argument")
default:
return e
}
@ -455,7 +456,7 @@ func (f *ScmpFilter) addRuleGeneric(call ScmpSyscall, action ScmpAction, exact b
} else {
argsArr := C.make_arg_cmp_array(C.uint(len(conds)))
if argsArr == nil {
return fmt.Errorf("error allocating memory for conditions")
return errors.New("error allocating memory for conditions")
}
defer C.free(argsArr)
@ -495,7 +496,7 @@ func sanitizeAction(in ScmpAction) error {
}
if inTmp != ActTrace && inTmp != ActErrno && (in&0xFFFF0000) != 0 {
return fmt.Errorf("highest 16 bits must be zeroed except for Trace and Errno")
return errors.New("highest 16 bits must be zeroed except for Trace and Errno")
}
return nil

10
vendor/modules.txt vendored
View file

@ -111,7 +111,7 @@ github.com/containers/buildah/pkg/rusage
github.com/containers/buildah/pkg/sshagent
github.com/containers/buildah/pkg/util
github.com/containers/buildah/util
# github.com/containers/common v0.48.1-0.20220608111710-dbecabbe82c9
# github.com/containers/common v0.48.1-0.20220624132904-722a80e139ec
## explicit
github.com/containers/common/libimage
github.com/containers/common/libimage/define
@ -555,10 +555,13 @@ github.com/opencontainers/go-digest
## explicit
github.com/opencontainers/image-spec/specs-go
github.com/opencontainers/image-spec/specs-go/v1
# github.com/opencontainers/runc v1.1.3
# github.com/opencontainers/runc v1.1.3 => github.com/opencontainers/runc v1.1.1-0.20220617142545-8b9452f75cbc
## explicit
github.com/opencontainers/runc/libcontainer/apparmor
github.com/opencontainers/runc/libcontainer/cgroups
github.com/opencontainers/runc/libcontainer/cgroups/fs
github.com/opencontainers/runc/libcontainer/cgroups/fs2
github.com/opencontainers/runc/libcontainer/cgroups/fscommon
github.com/opencontainers/runc/libcontainer/configs
github.com/opencontainers/runc/libcontainer/devices
github.com/opencontainers/runc/libcontainer/user
@ -628,7 +631,7 @@ github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/tcp
github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/udp
github.com/rootless-containers/rootlesskit/pkg/port/builtin/parent/udp/udpproxy
github.com/rootless-containers/rootlesskit/pkg/port/portutil
# github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646
# github.com/seccomp/libseccomp-golang v0.10.0
github.com/seccomp/libseccomp-golang
# github.com/sirupsen/logrus v1.8.1
## explicit
@ -869,3 +872,4 @@ gopkg.in/yaml.v2
gopkg.in/yaml.v3
# sigs.k8s.io/yaml v1.3.0
sigs.k8s.io/yaml
# github.com/opencontainers/runc => github.com/opencontainers/runc v1.1.1-0.20220617142545-8b9452f75cbc