Skip to content

[FIX] CNIEnv concurrency issue #4202

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 50 additions & 96 deletions pkg/netutil/netutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ import (

"github.com/containerd/nerdctl/v2/pkg/api/types"
"github.com/containerd/nerdctl/v2/pkg/labels"
"github.com/containerd/nerdctl/v2/pkg/lockutil"
"github.com/containerd/nerdctl/v2/pkg/netutil/nettype"
subnetutil "github.com/containerd/nerdctl/v2/pkg/netutil/subnet"
"github.com/containerd/nerdctl/v2/pkg/strutil"
Expand All @@ -53,14 +52,7 @@ type CNIEnv struct {
type CNIEnvOpt func(e *CNIEnv) error

func (e *CNIEnv) ListNetworksMatch(reqs []string, allowPseudoNetwork bool) (list map[string][]*NetworkConfig, errs []error) {
var err error

var networkConfigs []*NetworkConfig
// NOTE: we cannot lock NetconfPath directly, as Cilium (maybe others) are also locking it.
err = lockutil.WithDirLock(filepath.Join(e.NetconfPath, ".nerdctl.lock"), func() error {
networkConfigs, err = e.networkConfigList()
return err
})
networkConfigs, err := fsRead(e)
if err != nil {
return nil, []error{err}
}
Expand Down Expand Up @@ -188,7 +180,8 @@ func WithDefaultNetwork(bridgeIP string) CNIEnvOpt {

func WithNamespace(namespace string) CNIEnvOpt {
return func(e *CNIEnv) error {
if err := os.MkdirAll(filepath.Join(e.NetconfPath, namespace), 0755); err != nil {
err := fsEnsureRoot(e, namespace)
if err != nil {
return err
}
e.Namespace = namespace
Expand All @@ -201,7 +194,8 @@ func NewCNIEnv(cniPath, cniConfPath string, opts ...CNIEnvOpt) (*CNIEnv, error)
Path: cniPath,
NetconfPath: cniConfPath,
}
if err := os.MkdirAll(e.NetconfPath, 0755); err != nil {

if err := fsEnsureRoot(&e, ""); err != nil {
return nil, err
}

Expand All @@ -215,25 +209,17 @@ func NewCNIEnv(cniPath, cniConfPath string, opts ...CNIEnvOpt) (*CNIEnv, error)
}

func (e *CNIEnv) NetworkList() ([]*NetworkConfig, error) {
var netConfigList []*NetworkConfig
var err error
fn := func() error {
netConfigList, err = e.networkConfigList()
return err
}
err = lockutil.WithDirLock(filepath.Join(e.NetconfPath, ".nerdctl.lock"), fn)

return netConfigList, err
return fsRead(e)
}

func (e *CNIEnv) NetworkMap() (map[string]*NetworkConfig, error) { //nolint:revive
networks, err := e.networkConfigList()
netConfigList, err := fsRead(e)
if err != nil {
return nil, err
}

m := make(map[string]*NetworkConfig, len(networks))
for _, n := range networks {
m := make(map[string]*NetworkConfig, len(netConfigList))
for _, n := range netConfigList {
if original, exists := m[n.Name]; exists {
log.L.Warnf("duplicate network name %q, %#v will get superseded by %#v", n.Name, original, n)
}
Expand All @@ -243,12 +229,12 @@ func (e *CNIEnv) NetworkMap() (map[string]*NetworkConfig, error) { //nolint:revi
}

func (e *CNIEnv) NetworkByNameOrID(key string) (*NetworkConfig, error) {
networks, err := e.networkConfigList()
netConfigList, err := fsRead(e)
if err != nil {
return nil, err
}

for _, n := range networks {
for _, n := range netConfigList {
if n.Name == key {
return n, nil
}
Expand All @@ -261,36 +247,31 @@ func (e *CNIEnv) NetworkByNameOrID(key string) (*NetworkConfig, error) {
}

func (e *CNIEnv) filterNetworks(filterf func(*NetworkConfig) bool) ([]*NetworkConfig, error) {
networkConfigs, err := e.networkConfigList()
netConfigList, err := fsRead(e)
if err != nil {
return nil, err
}
result := []*NetworkConfig{}
for _, networkConfig := range networkConfigs {
for _, networkConfig := range netConfigList {
if filterf(networkConfig) {
result = append(result, networkConfig)
}
}
return result, nil
}

func (e *CNIEnv) getConfigPathForNetworkName(netName string) string {
if netName == DefaultNetworkName || e.Namespace == "" {
return filepath.Join(e.NetconfPath, "nerdctl-"+netName+".conflist")
}
return filepath.Join(e.NetconfPath, e.Namespace, "nerdctl-"+netName+".conflist")
}

func (e *CNIEnv) usedSubnets() ([]*net.IPNet, error) {
usedSubnets, err := subnetutil.GetLiveNetworkSubnets()
if err != nil {
return nil, err
}
networkConfigs, err := e.networkConfigList()

netConfigList, err := fsRead(e)
if err != nil {
return nil, err
}
for _, netConf := range networkConfigs {

for _, netConf := range netConfigList {
usedSubnets = append(usedSubnets, netConf.subnets()...)
}
return usedSubnets, nil
Expand All @@ -314,44 +295,39 @@ type cniNetworkConfig struct {
func (e *CNIEnv) CreateNetwork(opts types.NetworkCreateOptions) (*NetworkConfig, error) { //nolint:revive
var netConf *NetworkConfig

fn := func() error {
netMap, err := e.NetworkMap()
if err != nil {
return err
}
netMap, err := e.NetworkMap()
if err != nil {
return nil, err
}

if _, ok := netMap[opts.Name]; ok {
return errdefs.ErrAlreadyExists
}
ipam, err := e.generateIPAM(opts.IPAMDriver, opts.Subnets, opts.Gateway, opts.IPRange, opts.IPAMOptions, opts.IPv6)
if err != nil {
return err
}
plugins, err := e.generateCNIPlugins(opts.Driver, opts.Name, ipam, opts.Options, opts.IPv6)
if err != nil {
return err
}
netConf, err = e.generateNetworkConfig(opts.Name, opts.Labels, plugins)
if err != nil {
return err
}
return e.writeNetworkConfig(netConf)
// See note in fsWrite. Just because it does not exist now does not guarantee it will still not exist later.
// This is more a perf optimization at this point than a true check.
if _, ok := netMap[opts.Name]; ok {
return nil, errdefs.ErrAlreadyExists
}
err := lockutil.WithDirLock(filepath.Join(e.NetconfPath, ".nerdctl.lock"), fn)
ipam, err := e.generateIPAM(opts.IPAMDriver, opts.Subnets, opts.Gateway, opts.IPRange, opts.IPAMOptions, opts.IPv6)
if err != nil {
return nil, err
}
plugins, err := e.generateCNIPlugins(opts.Driver, opts.Name, ipam, opts.Options, opts.IPv6)
if err != nil {
return nil, err
}
netConf, err = e.generateNetworkConfig(opts.Name, opts.Labels, plugins)
if err != nil {
return nil, err
}
err = fsWrite(e, netConf)

// See note above. If it exists, we got raced out by another process. Consider this to NOT be a hard error.
if err != nil && !errdefs.IsAlreadyExists(err) {
return nil, err
}
return netConf, nil
}

func (e *CNIEnv) RemoveNetwork(net *NetworkConfig) error {
fn := func() error {
if err := os.RemoveAll(net.File); err != nil {
return err
}
return net.clean()
}
return lockutil.WithDirLock(filepath.Join(e.NetconfPath, ".nerdctl.lock"), fn)
return fsRemove(e, net)
}

// GetDefaultNetworkConfig checks whether the default network exists
Expand Down Expand Up @@ -394,8 +370,8 @@ func (e *CNIEnv) GetDefaultNetworkConfig() (*NetworkConfig, error) {

// Warn the user if the default network was not created by nerdctl.
match := nameMatches[0]
_, statErr := os.Stat(e.getConfigPathForNetworkName(DefaultNetworkName))
if match.NerdctlID == nil || statErr != nil {
exists, statErr := fsExists(e, DefaultNetworkName)
if match.NerdctlID == nil || statErr != nil || !exists {
log.L.Warnf("default network named %q does not have an internal nerdctl ID or nerdctl-managed config file, it was most likely NOT created by nerdctl", DefaultNetworkName)
}

Expand All @@ -419,9 +395,12 @@ func (e *CNIEnv) ensureDefaultNetworkConfig(bridgeIP string) error {
}

func (e *CNIEnv) createDefaultNetworkConfig(bridgeIP string) error {
filename := e.getConfigPathForNetworkName(DefaultNetworkName)
if _, err := os.Stat(filename); err == nil {
return fmt.Errorf("already found existing network config at %q, cannot create new network named %q", filename, DefaultNetworkName)
exist, err := fsExists(e, DefaultNetworkName)
if err != nil && !os.IsNotExist(err) {
return err
}
if exist {
return fmt.Errorf("already found existing network config, cannot create new network named %q", DefaultNetworkName)
}

bridgeCIDR := DefaultCIDR
Expand All @@ -443,7 +422,7 @@ func (e *CNIEnv) createDefaultNetworkConfig(bridgeIP string) error {
Labels: []string{fmt.Sprintf("%s=true", labels.NerdctlDefaultNetwork)},
}

_, err := e.CreateNetwork(opts)
_, err = e.CreateNetwork(opts)
if err != nil && !errdefs.IsAlreadyExists(err) {
return err
}
Expand Down Expand Up @@ -490,31 +469,6 @@ func (e *CNIEnv) generateNetworkConfig(name string, labels []string, plugins []C
}, nil
}

// writeNetworkConfig writes NetworkConfig file to cni config path.
func (e *CNIEnv) writeNetworkConfig(net *NetworkConfig) error {
filename := e.getConfigPathForNetworkName(net.Name)
if _, err := os.Stat(filename); err == nil {
return errdefs.ErrAlreadyExists
}
return os.WriteFile(filename, net.Bytes, 0644)
}

// networkConfigList loads config from dir if dir exists.
func (e *CNIEnv) networkConfigList() ([]*NetworkConfig, error) {
common, err := libcni.ConfFiles(e.NetconfPath, []string{".conf", ".conflist", ".json"})
if err != nil {
return nil, err
}
namespaced := []string{}
if e.Namespace != "" {
namespaced, err = libcni.ConfFiles(filepath.Join(e.NetconfPath, e.Namespace), []string{".conf", ".conflist", ".json"})
if err != nil {
return nil, err
}
}
return cniLoad(append(common, namespaced...))
}

func wrapCNIError(fileName string, err error) error {
return fmt.Errorf("failed marshalling json out of network configuration file %q: %w\n"+
"For details on the schema, see https://pkg.go.dev/github.com/containernetworking/cni/libcni#NetworkConfigList", fileName, err)
Expand Down
100 changes: 100 additions & 0 deletions pkg/netutil/store.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*
Copyright The containerd Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package netutil

import (
"os"
"path/filepath"

"github.com/containernetworking/cni/libcni"

"github.com/containerd/errdefs"

"github.com/containerd/nerdctl/v2/pkg/lockutil"
)

// NOTE: libcni is not safe to use concurrently - or at least delegates concurrency management to the consumer.
// Furthermore, CNIEnv (prior to this) is assuming the filesystem is ACID and other TOCTOU faults.
// This small set of methods here are meant to isolate CNIEnv entirely from the filesystem.
// This is NOT proper - we should instead use the Store implementation, which is the generic abstraction for ACID
// operations - but for now that will do, waiting for a full rewrite of CNIEnv.

func fsEnsureRoot(e *CNIEnv, namespace string) error {
path := e.NetconfPath
if namespace != "" {
path = filepath.Join(e.NetconfPath, namespace)
}
return os.MkdirAll(path, 0755)
}

func fsRemove(e *CNIEnv, net *NetworkConfig) error {
fn := func() error {
if err := os.RemoveAll(net.File); err != nil {
return err
}
return net.clean()
}
return lockutil.WithDirLock(filepath.Join(e.NetconfPath, ".nerdctl.lock"), fn)
}

func fsExists(e *CNIEnv, name string) (bool, error) {
fi, err := os.Stat(getConfigPathForNetworkName(e, name))
return !os.IsNotExist(err) && !fi.IsDir(), err
}

func fsWrite(e *CNIEnv, net *NetworkConfig) error {
filename := getConfigPathForNetworkName(e, net.Name)
// FIXME: note that this is still problematic.
// Concurrent access may independently first figure out that a given network is missing, and while the lock
// here will prevent concurrent writes, one of the routines will fail.
// Consuming code MUST account for that scenario.
return lockutil.WithDirLock(filepath.Join(e.NetconfPath, ".nerdctl.lock"), func() error {
if _, err := os.Stat(filename); err == nil {
return errdefs.ErrAlreadyExists
}
return os.WriteFile(filename, net.Bytes, 0644)
})
}

func fsRead(e *CNIEnv) ([]*NetworkConfig, error) {
var nc []*NetworkConfig
var err error
err = lockutil.WithDirLock(filepath.Join(e.NetconfPath, ".nerdctl.lock"), func() error {
namespaced := []string{}
var common []string
common, err = libcni.ConfFiles(e.NetconfPath, []string{".conf", ".conflist", ".json"})
if err != nil {
return err
}
if e.Namespace != "" {
namespaced, err = libcni.ConfFiles(filepath.Join(e.NetconfPath, e.Namespace), []string{".conf", ".conflist", ".json"})
if err != nil {
return err
}
}
nc, err = cniLoad(append(common, namespaced...))
return err
})
return nc, err
}

func getConfigPathForNetworkName(e *CNIEnv, netName string) string {
if netName == DefaultNetworkName || e.Namespace == "" {
return filepath.Join(e.NetconfPath, "nerdctl-"+netName+".conflist")
}
return filepath.Join(e.NetconfPath, e.Namespace, "nerdctl-"+netName+".conflist")
}
4 changes: 3 additions & 1 deletion pkg/ocihook/ocihook.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,11 +103,13 @@ func Run(stdin io.Reader, stderr io.Writer, event, dataStore, cniPath, cniNetcon
// This below is a stopgap solution that just enforces a global lock
// Note this here is probably not enough, as concurrent CNI operations may happen outside of the scope of ocihooks
// through explicit calls to Remove, etc.
// Finally note that this is not the same (albeit similar) as libcni filesystem manipulation locking,
// hence the independent lock
err = os.MkdirAll(cniNetconfPath, 0o700)
if err != nil {
return err
}
lock, err := lockutil.Lock(filepath.Join(cniNetconfPath, ".nerdctl.lock"))
lock, err := lockutil.Lock(filepath.Join(cniNetconfPath, ".cni-concurrency.lock"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

.nerdctl.lock is used to lock cni conf file manipulation. This lock here is a global locking mechanism meant to prevent cni plugins from acting concurrently.

This lock is enforced for almost the entirety of the ocihook flow.
It was previously using the same lock, as we were not locking the file operations in cnienv that are used in oci-hooks.

Since we do now, it can no longer use the same lock.

The note around line 100 explains some of that.

if err != nil {
return err
}
Expand Down
Loading