pr feedback - handle evictions and wait for disk pressure condition

2026-02-01 06:09:07 +08:00 · 2025-12-29 18:01:33 +00:00
parent ed0d2c13b6
commit f4d28fa6d2
4 changed files with 64 additions and 1 deletions
--- a/.github/workflows/cloud-runner-integrity.yml
+++ b/.github/workflows/cloud-runner-integrity.yml
@@ -242,6 +242,27 @@ jobs:
            echo "Taint removed. Checking nodes..."
            kubectl describe nodes | grep -i taint || echo "No taints found"
          fi
+          # Wait for the DiskPressure node condition itself to clear (not just the taint).
+          # Every node always lists a DiskPressure condition; only status "True" means pressure.
+          DISK_PRESSURE_QUERY='{.items[*].status.conditions[?(@.type=="DiskPressure")].status}'
+          echo "Waiting for disk pressure condition to clear on nodes..."
+          for i in {1..20}; do
+            if kubectl get nodes -o jsonpath="$DISK_PRESSURE_QUERY" 2>/dev/null | grep -q "True"; then
+              echo "Disk pressure condition still present, waiting... ($i/20)"
+              sleep 2
+            else
+              echo "Disk pressure condition cleared, proceeding with test"
+              break
+            fi
+          done
+          # Final check - if the condition is still True, remove the taint again and wait a bit more
+          if kubectl get nodes -o jsonpath="$DISK_PRESSURE_QUERY" 2>/dev/null | grep -q "True"; then
+            echo "WARNING: Disk pressure condition still exists. Removing taint and waiting 10 seconds..."
+            for node in $(kubectl get nodes -o name 2>/dev/null | sed 's/node\///'); do
+              kubectl taint nodes "$node" node.kubernetes.io/disk-pressure- 2>/dev/null || true
+            done
+            sleep 10
+          fi
      - name: Run cloud-runner-image test (validate image creation)
        timeout-minutes: 10
        run: yarn run test "cloud-runner-image" --detectOpenHandles --forceExit --runInBand
@@ -471,6 +492,27 @@ jobs:
            echo "Taint removed. Checking nodes..."
            kubectl describe nodes | grep -i taint || echo "No taints found"
          fi
+          # Wait for the DiskPressure node condition itself to clear (not just the taint).
+          # Every node always lists a DiskPressure condition; only status "True" means pressure.
+          DISK_PRESSURE_QUERY='{.items[*].status.conditions[?(@.type=="DiskPressure")].status}'
+          echo "Waiting for disk pressure condition to clear on nodes..."
+          for i in {1..20}; do
+            if kubectl get nodes -o jsonpath="$DISK_PRESSURE_QUERY" 2>/dev/null | grep -q "True"; then
+              echo "Disk pressure condition still present, waiting... ($i/20)"
+              sleep 2
+            else
+              echo "Disk pressure condition cleared, proceeding with test"
+              break
+            fi
+          done
+          # Final check - if the condition is still True, remove the taint again and wait a bit more
+          if kubectl get nodes -o jsonpath="$DISK_PRESSURE_QUERY" 2>/dev/null | grep -q "True"; then
+            echo "WARNING: Disk pressure condition still exists. Removing taint and waiting 10 seconds..."
+            for node in $(kubectl get nodes -o name 2>/dev/null | sed 's/node\///'); do
+              kubectl taint nodes "$node" node.kubernetes.io/disk-pressure- 2>/dev/null || true
+            done
+            sleep 10
+          fi
      - name: Run cloud-runner-s3-steps test (validate S3 operations with K8s)
        timeout-minutes: 30
        run: yarn run test "cloud-runner-s3-steps" --detectOpenHandles --forceExit --runInBand
--- a/dist/index.js
+++ b/dist/index.js
@@ -4147,6 +4147,14 @@ class KubernetesPods {
                cloud_runner_logger_1.default.logWarning(`Pod ${podName} has PreStopHook failure but no container failure detected. Treating as non-fatal.`);
                return false; // PreStopHook failure alone is not fatal if container status is unclear
            }
+            // Fail fast on eviction: if any detail mentions "evicted" or "diskpressure" (any case), log it and throw an infrastructure error
+            const wasEvicted = errorDetails.some((detail) => detail.toLowerCase().includes('evicted') || detail.toLowerCase().includes('diskpressure'));
+            if (wasEvicted) {
+                const evictionMessage = `Pod ${podName} was evicted due to disk pressure. This is a test infrastructure issue - the cluster doesn't have enough disk space.`;
+                cloud_runner_logger_1.default.logWarning(evictionMessage);
+                cloud_runner_logger_1.default.log(`Pod details: ${errorDetails.join('\n')}`);
+                throw new Error(`${evictionMessage}\nThis indicates the test environment needs more disk space or better cleanup.\n${errorDetails.join('\n')}`);
+            }
            // Exit code 137 (128 + 9) means SIGKILL - container was killed by system (often OOM)
            // If this happened with PreStopHook failure, it might be a resource issue, not a real failure
            // Be lenient if we only have PreStopHook/ExceededGracePeriod issues
--- a/dist/index.js.map
+++ b/dist/index.js.map
--- a/src/model/cloud-runner/providers/k8s/kubernetes-pods.ts
+++ b/src/model/cloud-runner/providers/k8s/kubernetes-pods.ts
@@ -147,6 +147,19 @@ class KubernetesPods {
        return false; // PreStopHook failure alone is not fatal if container status is unclear
      }

+      // Fail fast on eviction: if any detail mentions "evicted" or "diskpressure" (any case), log it and throw an infrastructure error
+      const wasEvicted = errorDetails.some((detail) =>
+        detail.toLowerCase().includes('evicted') || detail.toLowerCase().includes('diskpressure'),
+      );
+      if (wasEvicted) {
+        const evictionMessage = `Pod ${podName} was evicted due to disk pressure. This is a test infrastructure issue - the cluster doesn't have enough disk space.`;
+        CloudRunnerLogger.logWarning(evictionMessage);
+        CloudRunnerLogger.log(`Pod details: ${errorDetails.join('\n')}`);
+        throw new Error(
+          `${evictionMessage}\nThis indicates the test environment needs more disk space or better cleanup.\n${errorDetails.join('\n')}`,
+        );
+      }
+
      // Exit code 137 (128 + 9) means SIGKILL - container was killed by system (often OOM)
      // If this happened with PreStopHook failure, it might be a resource issue, not a real failure
      // Be lenient if we only have PreStopHook/ExceededGracePeriod issues