diff --git a/contrib/test/ci/vars.yml b/contrib/test/ci/vars.yml index 242e09e5286..17fcb89f9db 100644 --- a/contrib/test/ci/vars.yml +++ b/contrib/test/ci/vars.yml @@ -154,6 +154,7 @@ kata_skip_network_tests: - 'test "Connect to pod hostport from the host"' - 'test "Clean up network if pod sandbox fails"' - 'test "Clean up network if pod sandbox gets killed"' + - 'test "Network recovery after reboot with destroyed netns"' kata_skip_pod_tests: - 'test "pass pod sysctls to runtime"' - 'test "pass pod sysctls to runtime when in userns"' diff --git a/internal/storage/image_test.go b/internal/storage/image_test.go index 65edd52a4fd..b71d141b6ed 100644 --- a/internal/storage/image_test.go +++ b/internal/storage/image_test.go @@ -29,7 +29,6 @@ var _ = t.Describe("Image", func() { testDockerRegistry = "docker.io" testQuayRegistry = "quay.io" testRedHatRegistry = "registry.access.redhat.com" - testFedoraRegistry = "registry.fedoraproject.org" testImageName = "image" testImageAlias = "image-for-testing" testImageAliasResolved = "registry.crio.test.com/repo" @@ -193,7 +192,6 @@ var _ = t.Describe("Image", func() { Expect(refsToNames(refs)).To(Equal([]string{ testQuayRegistry + "/" + testImageName + ":latest", testRedHatRegistry + "/" + testImageName + ":latest", - testFedoraRegistry + "/" + testImageName + ":latest", testDockerRegistry + "/library/" + testImageName + ":latest", })) }) @@ -247,7 +245,6 @@ var _ = t.Describe("Image", func() { Expect(refsToNames(refs)).To(Equal([]string{ testQuayRegistry + "/" + testImageName + "@sha256:" + testSHA256, testRedHatRegistry + "/" + testImageName + "@sha256:" + testSHA256, - testFedoraRegistry + "/" + testImageName + "@sha256:" + testSHA256, testDockerRegistry + "/library/" + testImageName + "@sha256:" + testSHA256, })) }) diff --git a/server/sandbox_network.go b/server/sandbox_network.go index dd4e431f343..4c86f501c94 100644 --- a/server/sandbox_network.go +++ b/server/sandbox_network.go @@ -181,23 +181,42 @@ func (s *Server) networkStop(ctx context.Context, sb *sandbox.Sandbox) error { podNetwork, err := s.newPodNetwork(ctx, sb) if err != nil { - return err + return fmt.Errorf("failed to create pod network for sandbox %s(%s): %w", sb.Name(), sb.ID(), err) } - if err := s.config.CNIPlugin().TearDownPodWithContext(stopCtx, podNetwork); err != nil { - retErr := fmt.Errorf("failed to destroy network for pod sandbox %s(%s): %w", sb.Name(), sb.ID(), err) - + // Check if the network namespace file exists and is valid before attempting CNI teardown. + // If the file doesn't exist or is invalid, skip CNI teardown and mark network as stopped. + if podNetwork.NetNS != "" { if _, statErr := os.Stat(podNetwork.NetNS); statErr != nil { - return fmt.Errorf("%w: stat netns path %q: %w", retErr, podNetwork.NetNS, statErr) + // Network namespace file doesn't exist, mark network as stopped and return success + log.Debugf(ctx, "Network namespace file %s does not exist for pod sandbox %s(%s), skipping CNI teardown", + podNetwork.NetNS, sb.Name(), sb.ID()) + + return sb.SetNetworkStopped(ctx, true) + } + + if validateErr := s.validateNetworkNamespace(podNetwork.NetNS); validateErr != nil { + // Network namespace file exists but is invalid (e.g., corrupted or fake file) + log.Warnf(ctx, "Network namespace file %s is invalid for pod sandbox %s(%s): %v, removing and skipping CNI teardown", + podNetwork.NetNS, sb.Name(), sb.ID(), validateErr) + s.cleanupNetns(ctx, podNetwork.NetNS, sb) + + return sb.SetNetworkStopped(ctx, true) } + } - // The netns file may still exists, which means that it's likely - // corrupted. Remove it to allow cleanup of the network namespace: - if rmErr := os.RemoveAll(podNetwork.NetNS); rmErr != nil { - return fmt.Errorf("%w: failed to remove netns path: %w", retErr, rmErr) + if err := s.config.CNIPlugin().TearDownPodWithContext(stopCtx, podNetwork); err != nil { + log.Warnf(ctx, "Failed to destroy network for pod sandbox %s(%s): %v", sb.Name(), sb.ID(), err) + + // If the network namespace exists but CNI teardown failed, try to clean it up. + if podNetwork.NetNS != "" { + if _, statErr := os.Stat(podNetwork.NetNS); statErr == nil { + // Clean up the netns file since CNI teardown failed. + s.cleanupNetns(ctx, podNetwork.NetNS, sb) + } } - log.Warnf(ctx, "Removed invalid netns path %s from pod sandbox %s(%s)", podNetwork.NetNS, sb.Name(), sb.ID()) + return fmt.Errorf("network teardown failed for pod sandbox %s(%s): %w", sb.Name(), sb.ID(), err) } return sb.SetNetworkStopped(ctx, true) diff --git a/server/sandbox_network_freebsd.go b/server/sandbox_network_freebsd.go new file mode 100644 index 00000000000..511ee579dcb --- /dev/null +++ b/server/sandbox_network_freebsd.go @@ -0,0 +1,24 @@ +//go:build freebsd +// +build freebsd + +package server + +import ( + "context" + + "github.com/cri-o/cri-o/internal/lib/sandbox" + "github.com/cri-o/cri-o/internal/log" +) + +// validateNetworkNamespace checks if the given path is a valid network namespace +// On FreeBSD, this is a no-op since network namespaces are Linux-specific. +func (s *Server) validateNetworkNamespace(netnsPath string) error { + // Network namespaces are Linux-specific, so on FreeBSD we assume it's valid + return nil +} + +// cleanupNetns removes a network namespace file and logs the action +// On FreeBSD, this is a no-op since network namespaces are Linux-specific. +func (s *Server) cleanupNetns(ctx context.Context, netnsPath string, sb *sandbox.Sandbox) { + log.Debugf(ctx, "Network namespace cleanup not supported on this platform") +} diff --git a/server/sandbox_network_linux.go b/server/sandbox_network_linux.go new file mode 100644 index 00000000000..1d4ae931cd0 --- /dev/null +++ b/server/sandbox_network_linux.go @@ -0,0 +1,36 @@ +//go:build linux +// +build linux + +package server + +import ( + "context" + "fmt" + "os" + + "github.com/containernetworking/plugins/pkg/ns" + + "github.com/cri-o/cri-o/internal/lib/sandbox" + "github.com/cri-o/cri-o/internal/log" +) + +// validateNetworkNamespace checks if the given path is a valid network namespace. +func (s *Server) validateNetworkNamespace(netnsPath string) error { + netns, err := ns.GetNS(netnsPath) + if err != nil { + return fmt.Errorf("invalid network namespace: %w", err) + } + + defer netns.Close() + + return nil +} + +// cleanupNetns removes a network namespace file and logs the action. +func (s *Server) cleanupNetns(ctx context.Context, netnsPath string, sb *sandbox.Sandbox) { + if rmErr := os.RemoveAll(netnsPath); rmErr != nil { + log.Warnf(ctx, "Failed to remove netns path %s: %v", netnsPath, rmErr) + } else { + log.Infof(ctx, "Removed netns path %s from pod sandbox %s(%s)", netnsPath, sb.Name(), sb.ID()) + } +} diff --git a/server/sandbox_network_unsupported.go b/server/sandbox_network_unsupported.go new file mode 100644 index 00000000000..d9067ae46b6 --- /dev/null +++ b/server/sandbox_network_unsupported.go @@ -0,0 +1,23 @@ +//go:build !linux && !freebsd +// +build !linux,!freebsd + +package server + +import ( + "context" + + "github.com/cri-o/cri-o/internal/lib/sandbox" + "github.com/cri-o/cri-o/internal/log" +) + +// validateNetworkNamespace checks if the given path is a valid network namespace +// On unsupported platforms, this is a no-op since network namespaces are Linux-specific. +func (s *Server) validateNetworkNamespace(netnsPath string) error { + return nil +} + +// cleanupNetns removes a network namespace file and logs the action +// On unsupported platforms, this is a no-op since network namespaces are Linux-specific. +func (s *Server) cleanupNetns(ctx context.Context, netnsPath string, sb *sandbox.Sandbox) { + log.Debugf(ctx, "Network namespace cleanup not supported on this platform") +} diff --git a/test/image.bats b/test/image.bats index 10ead5e7df5..0c5c5b3e161 100644 --- a/test/image.bats +++ b/test/image.bats @@ -104,7 +104,9 @@ function teardown() { mkdir -p "$TESTDIR/imagestore" CONTAINER_IMAGESTORE="$TESTDIR/imagestore" start_crio - FEDORA="registry.fedoraproject.org/fedora" + # registry.fedoraproject.org is pretty flaky + # Moving to the stable quay.io + FEDORA="quay.io/fedora/fedora" crictl pull $FEDORA imageid=$(crictl images --quiet "$FEDORA") [ "$imageid" != "" ] diff --git a/test/network.bats b/test/network.bats index 2c89725c66a..09a63e1f818 100644 --- a/test/network.bats +++ b/test/network.bats @@ -185,5 +185,42 @@ function check_networking() { # be able to remove the sandbox crictl rmp -f "$POD" - grep -q "Removed invalid netns path $NETNS_PATH$NS from pod sandbox" "$CRIO_LOG" + grep -q "Removed netns path $NETNS_PATH$NS from pod sandbox" "$CRIO_LOG" +} + +@test "Network recovery after reboot with destroyed netns" { + # This test simulates a reboot scenario where network namespaces are destroyed + # but CRI-O needs to clean up pod network resources gracefully. + + start_crio + + pod_id=$(crictl runp "$TESTDATA"/sandbox_config.json) + + # Get the network namespace path + NETNS_PATH=/var/run/netns/ + NS=$(crictl inspectp "$pod_id" | + jq -er '.info.runtimeSpec.linux.namespaces[] | select(.type == "network").path | sub("'$NETNS_PATH'"; "")') + + # Remove the network namespace. + ip netns del "$NS" + + # Create a fake netns file. + touch "$NETNS_PATH$NS" + + restart_crio + + # Try to remove the pod. + crictl rmp -f "$pod_id" 2> /dev/null || true + + grep -q "Successfully cleaned up network for pod" "$CRIO_LOG" + + new_pod_id=$(crictl runp "$TESTDATA"/sandbox_config.json) + + # Verify the new pod is running. + output=$(crictl inspectp "$new_pod_id" | jq -r '.status.state') + [[ "$output" == "SANDBOX_READY" ]] + + # Clean up the new pod + crictl stopp "$new_pod_id" + crictl rmp "$new_pod_id" } diff --git a/test/registries.conf b/test/registries.conf index 0fb3be9f31e..c0fc9f53b5e 100644 --- a/test/registries.conf +++ b/test/registries.conf @@ -1,4 +1,4 @@ -unqualified-search-registries = ['quay.io' ,'registry.access.redhat.com', 'registry.fedoraproject.org', 'docker.io'] +unqualified-search-registries = ['quay.io', 'registry.access.redhat.com', 'docker.io'] [aliases] "image-for-testing" = "registry.crio.test.com/repo" diff --git a/test/testdata/Dockerfile b/test/testdata/Dockerfile index 59c6ce56c69..8ee16e75d92 100644 --- a/test/testdata/Dockerfile +++ b/test/testdata/Dockerfile @@ -1,4 +1,4 @@ -FROM registry.fedoraproject.org/fedora-minimal:38 +FROM quay.io/fedora/fedora-minimal:38 RUN microdnf install -y coreutils \ gcc \ gzip \