humanlayer · allisoneer · May 12, 2025 · May 12, 2025 · May 12, 2025 · May 12, 2025
@@ -0,0 +1,148 @@
+name: E2E Tests
+
+on:
+  push:
+    branches: [ "main" ]
+    paths:
+      - "acp/**"
+      - ".github/workflows/e2e-tests.yml"
+  pull_request:
+    branches: [ "main" ]
+    paths:
+      - "acp/**"
+      - ".github/workflows/e2e-tests.yml"
+  # Allow manual triggering
+  workflow_dispatch:
+
+jobs:
+  # Unit tests already run in the Go CI workflow, so they are not repeated here.
+  # This job only checks that the image builds; it is not shared with e2e-test, which runs on a separate runner.
+  docker-build:
+    name: Build Docker Image
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.24'
+          cache: true
+          cache-dependency-path: acp/go.sum
+
+      - name: Cache acp tools
+        uses: actions/cache@v4
+        with:
+          path: acp/bin
+          key: ${{ runner.os }}-acp-bin-${{ hashFiles('acp/Makefile') }}
+          restore-keys: |
+            ${{ runner.os }}-acp-bin-
+
+      - name: Build Docker image
+        working-directory: acp
+        run: make docker-build IMG=example.com/acp:v0.0.1
+
+  e2e-test:
+    name: End-to-End Tests
+    needs: docker-build
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.24'
+          cache: true
+          cache-dependency-path: acp/go.sum
+
+      - name: Cache acp tools
+        uses: actions/cache@v4
+        with:
+          path: acp/bin
+          key: ${{ runner.os }}-acp-bin-${{ hashFiles('acp/Makefile') }}
+          restore-keys: |
+            ${{ runner.os }}-acp-bin-
+
+      # Set up a KinD cluster named "kind" with the engineerd/setup-kind action, pinned to KinD v0.20.0
+      - name: Setup KinD
+        uses: engineerd/setup-kind@v0.5.0
+        with:
+          version: "v0.20.0"
+          name: "kind"
+          config: "acp-example/kind/kind-config.yaml"
+          wait: "300s"
+
+      - name: Verify KinD cluster
+        run: |
+          kubectl cluster-info
+          kubectl get nodes
+          echo "KinD cluster created successfully!"
+
+      # Setup cert-manager which is required for the controller
+      - name: Install cert-manager
+        run: |
+          kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.2/cert-manager.yaml
+          kubectl wait --for=condition=Available --timeout=180s deployment/cert-manager-webhook -n cert-manager
+          echo "Cert-manager installed successfully!"
+
+      # Setup prometheus operator for metrics (using older version to avoid annotation size issue)
+      - name: Install Prometheus Operator
+        run: |
+          # Use an older version (v0.58.0) which doesn't have the annotation size issue
+          kubectl apply -f https://github.com/prometheus-operator/prometheus-operator/releases/download/v0.58.0/bundle.yaml || true
+          # Wait with a longer timeout but don't fail if it's not ready
+          kubectl wait --for=condition=Available --timeout=300s deployment/prometheus-operator -n default || true
+          echo "Prometheus operator installation attempted - continuing regardless of outcome"
+
+      # Load Docker image into the kind cluster for the controller
+      - name: Load the controller image
+        run: |
+          # Each job runs on a fresh runner, so the image from the docker-build job is not available here.
+          # example.com is a placeholder registry (nothing to pull), so build locally unless the image already exists.
+          docker image inspect example.com/acp:v0.0.1 > /dev/null 2>&1 || (cd acp && make docker-build IMG=example.com/acp:v0.0.1)
+          # Load the image into the kind cluster
+          kind load docker-image example.com/acp:v0.0.1 --name kind
+          echo "Docker image loaded successfully!"
+
+      # Run E2E tests
+      - name: Run e2e tests
+        working-directory: acp
+        timeout-minutes: 10
+        env:
+          # Set environment variables that might be needed by the tests
+          KUBECONFIG: /home/runner/.kube/config
+        run: |
+          echo "Running e2e tests..."
+          make test-e2e
+
+      # Upload test logs on failure for debugging
+      - name: Upload test logs
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: e2e-test-logs
+          path: |
+            /tmp/*.log
+            /home/runner/.kube/config
+          retention-days: 7
+
+      # Always dump cluster state for debugging; every command is best-effort so this step never fails the job
+      - name: Collect diagnostic information
+        if: always()
+        run: |
+          echo "==== Kubernetes Nodes ===="
+          kubectl get nodes -o wide || true
+
+          echo "==== Kubernetes Pods ===="
+          kubectl get pods -A || true
+
+          echo "==== Pod Logs ===="
+          kubectl logs -l control-plane=controller-manager -n acp-system --all-containers --tail=200 || true  # selector defaults to 10 lines
+
+          echo "==== Events ===="
+          kubectl get events --sort-by='.lastTimestamp' -A || true
@@ -106,65 +106,18 @@ jobs:
         working-directory: acp
         run: make build
 
-  # E2E tests are temporarily disabled due to configuration issues
-  # 
-  # Issues encountered:
-  # 1. The e2e test suite has a hardcoded image name "example.com/acp:v0.0.1" in e2e_suite_test.go
-  # 2. The test expects controller-manager pods to be created in the acp-system namespace
-  # 3. Attempts to fix:
-  #    - Setting KIND_CLUSTER environment variable to match the KinD cluster name
-  #    - Modifying the Makefile to check for the correct cluster name
-  #    - Trying to use the same image name that's hardcoded in the tests
-  # 4. The controller-manager pods never get created successfully during CI
+  # E2E tests are now run in a separate workflow file: .github/workflows/e2e-tests.yml
+  # This provides several benefits:
+  # - Faster CI for regular pushes (as E2E tests can take several minutes)
+  # - Better isolation of test failures
+  # - Ability to trigger E2E tests independently via workflow_dispatch
+  # - Specialized configuration for Kubernetes components
   #
-  # TODO: 
-  # - Fix the e2e test configuration to work properly in CI
-  # - Consider making the test image name configurable instead of hardcoded
-  # - Debug why the controller-manager pods aren't being created/started correctly
-  #
-  # e2e-test:
-  #   name: E2E Tests
-  #   runs-on: ubuntu-latest
-  #   needs: [build]
-  #   steps:
-  #     - name: Checkout repository
-  #       uses: actions/checkout@v4
-  #
-  #     - name: Set up Go
-  #       uses: actions/setup-go@v5
-  #       with:
-  #         go-version: '1.24'
-  #         cache: true
-  #         cache-dependency-path: acp/go.sum
-  #
-  #     - name: Setup KinD
-  #       uses: helm/kind-action@v1.9.0
-  #       with:
-  #         cluster_name: acp-example-cluster
-  #         config: acp-example/kind/kind-config.yaml
-  #
-  #     - name: Set timestamp
-  #       id: timestamp
-  #       run: echo "TIMESTAMP=$(date +%Y%m%d%H%M)" >> $GITHUB_ENV
-  #
-  #     - name: Fix test-e2e check for cluster
-  #       working-directory: acp
-  #       run: |
-  #         # Temporarily modify the Makefile to check for acp-example-cluster instead of 'kind'
-  #         sed -i 's/@kind get clusters | grep -q '"'"'kind'"'"'/@kind get clusters | grep -q '"'"'acp-example-cluster'"'"'/' Makefile
-  #
-  #     - name: Build and load controller image
-  #       working-directory: acp
-  #       env:
-  #         IMG: controller:${{ env.TIMESTAMP }}
-  #       run: make docker-build && kind load docker-image controller:${{ env.TIMESTAMP }} --name acp-example-cluster
-  #
-  #     - name: Run e2e tests
-  #       working-directory: acp
-  #       env:
-  #         KIND_CLUSTER: acp-example-cluster
-  #         IMG: controller:${{ env.TIMESTAMP }}
-  #       run: make test-e2e
+  # The e2e-tests.yml workflow addresses the previous issues:
+  # - Uses the exact image name expected by the tests: example.com/acp:v0.0.1
+  # - Properly configures the KinD cluster with the expected name 'kind'
+  # - Sets up required components like cert-manager and Prometheus
+  # - Provides detailed diagnostic output in case of failures
 
   docker:
     name: Docker Build

@@ -29,7 +29,7 @@ ACP (Agent Control Plane) is a cloud-native orchestrator for AI Agents built on
 - [Getting Started](#getting-started)
   - [Prerequisites](#prerequisites)
   - [Setting Up a Local Cluster](#setting-up-a-local-cluster)
-  - [Deploying ACP](#deploying-acp) 
+  - [Deploying ACP](#deploying-acp)
   - [Creating an Agent and Running your first task](#creating-an-agent-and-running-your-first-task)
   - [Adding Tools with MCP](#adding-tools-with-mcp)
   - [Using other language models](#using-other-language-models)
@@ -38,6 +38,7 @@ ACP (Agent Control Plane) is a cloud-native orchestrator for AI Agents built on
   - [Incorporating Humans as Tools](#humans-as-tools)
   - [Cleaning Up](#cleaning-up)
 - [Design Principles](#design-principles)
+- [End-to-End Testing](#end-to-end-testing)
 - [Contributing](#contributing)
 - [License](#license)
 
@@ -1302,6 +1303,52 @@ kind delete cluster
 
 - **Extensibility**: Because agents are YAML, it's easy to build and share agents, tools, and tasks.
 
+## End-to-End Testing
+
+The project includes comprehensive end-to-end tests that validate the full workflow described in this README. These tests:
+
+1. Create a Kind cluster
+2. Deploy the ACP operator
+3. Deploy sample resources (LLMs, MCP Servers, Agents, Tasks)
+4. Deploy the observability stack
+5. Verify all components are running correctly
+6. Test the complete workflow with Task execution
+
+### Running E2E Tests Locally
+
+To run the e2e tests that validate the README workflow locally:
+
+```bash
+make test-e2e
+```
+
+This command:
+- Builds the controller Docker image
+- Loads it into Kind
+- Sets up necessary components (Prometheus, cert-manager if not present)
+- Runs the e2e test suite
+- Verifies resources are created and functioning correctly
+
+### Continuous Integration
+
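+The E2E tests run automatically in our CI pipeline on pull requests and pushes to `main` that change files under `acp/` (or the E2E workflow itself), and can also be triggered manually, to ensure that the system continues to work as described in this README. The CI workflow: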
+
+- Builds and tests the ACP system in a clean environment
+- Sets up a dedicated Kubernetes cluster using Kind
+- Deploys all necessary components, including cert-manager and Prometheus
+- Runs the full E2E test suite to verify the entire workflow functions correctly
+- Collects and reports detailed diagnostic information for troubleshooting
+
+This ensures that any changes to the codebase do not break the documented workflow.
+
+### Test Structure
+
+The tests can be found in the `acp/test/e2e` directory:
+- `workflow_test.go` - Contains tests that validate the workflow described in this README
+- `e2e_test.go` - Contains tests for controller metrics and other functionality
+
+These tests serve as both validation of the codebase and as a working example of how to programmatically deploy and verify the ACP system.
+
 ## Roadmap
 
 

@@ -47,29 +47,41 @@ type MCPServerReconciler struct {
 }
 
 // updateStatus updates the status of the MCPServer resource with the latest version
+// It re-reads the resource and retries the status update up to 3 times (100ms apart); a failed Get returns immediately.
 func (r *MCPServerReconciler) updateStatus(ctx context.Context, req ctrl.Request, statusUpdate *acp.MCPServer) error {
 	logger := log.FromContext(ctx)
+	const maxRetries = 3
+
+	var updateErr error
+	for i := 0; i < maxRetries; i++ {
+		// Get the latest version of the MCPServer
+		var latestMCPServer acp.MCPServer
+		if err := r.Get(ctx, req.NamespacedName, &latestMCPServer); err != nil {
+			logger.Error(err, "Failed to get latest MCPServer before status update")
+			return err
+		}
 
-	// Get the latest version of the MCPServer
-	var latestMCPServer acp.MCPServer
-	if err := r.Get(ctx, req.NamespacedName, &latestMCPServer); err != nil {
-		logger.Error(err, "Failed to get latest MCPServer before status update")
-		return err
-	}
-
-	// Apply status updates to the latest version
-	latestMCPServer.Status.Connected = statusUpdate.Status.Connected
-	latestMCPServer.Status.Status = statusUpdate.Status.Status
-	latestMCPServer.Status.StatusDetail = statusUpdate.Status.StatusDetail
-	latestMCPServer.Status.Tools = statusUpdate.Status.Tools
+		// Apply status updates to the latest version
+		latestMCPServer.Status.Connected = statusUpdate.Status.Connected
+		latestMCPServer.Status.Status = statusUpdate.Status.Status
+		latestMCPServer.Status.StatusDetail = statusUpdate.Status.StatusDetail
+		latestMCPServer.Status.Tools = statusUpdate.Status.Tools
+
+		// Update the status
+		updateErr = r.Status().Update(ctx, &latestMCPServer)
+		if updateErr == nil {
+			// Success - no need for more retries
+			return nil
+		}
 
-	// Update the status
-	if err := r.Status().Update(ctx, &latestMCPServer); err != nil {
-		logger.Error(err, "Failed to update MCPServer status")
-		return err
+		// NOTE(review): every update error is retried here, not only conflicts, and the sleep ignores ctx cancellation; consider an explicit conflict check
+		logger.Info("Status update conflict, retrying", "attempt", i+1, "error", updateErr)
+		time.Sleep(time.Millisecond * 100)
 	}
 
-	return nil
+	// If we got here, we failed all retries
+	logger.Error(updateErr, "Failed to update MCPServer status after retries")
+	return updateErr
 }
 
 // Reconcile processes the MCPServer resource and establishes a connection to the MCP server
@@ -158,7 +170,7 @@ func (r *MCPServerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 
 	// Update status with tools
 	statusUpdate.Status.Connected = true
-	statusUpdate.Status.Status = "Ready"
+	statusUpdate.Status.Status = StatusReady
 	statusUpdate.Status.StatusDetail = fmt.Sprintf("Connected successfully with %d tools", len(tools))
 	statusUpdate.Status.Tools = tools
 	r.recorder.Event(&mcpServer, corev1.EventTypeNormal, "Connected", "MCP server connected successfully")