mirror of https://github.com/game-ci/unity-builder.git
pr feedback

.github/workflows/cloud-runner-integrity.yml | 216

@@ -29,7 +29,7 @@ jobs:
     name: Cloud Runner Tests (K8s)
     runs-on: ubuntu-latest
     env:
-      K3D_NODE_CONTAINERS: "k3d-unity-builder-agent-0"
+      K3D_NODE_CONTAINERS: 'k3d-unity-builder-agent-0'
     steps:
       - uses: actions/checkout@v4
         with:
@@ -38,7 +38,7 @@ jobs:
       - name: Set up kubectl
         uses: azure/setup-kubectl@v4
         with:
-          version: 'v1.34.1'
+          version: 'v1.31.0'
       - name: Install k3d
         run: |
           curl -s https://raw.githubusercontent.com/k3d-io/k3d/main/install.sh | bash
@@ -62,12 +62,14 @@ jobs:
       - name: Create k3s cluster (k3d)
         timeout-minutes: 5
         run: |
-          # Clean up any existing cluster and free disk space before creating new one
+          # Only delete if exists - don't aggressively clean up (may cause issues)
           k3d cluster delete unity-builder || true
-          docker system prune -af --volumes || true
-          # Create cluster - host.k3d.internal will allow pods to access host services
-          # No port mapping needed - LocalStack is on host, accessible via host.k3d.internal:4566
-          k3d cluster create unity-builder --agents 1 --wait
+          # Create cluster with explicit eviction thresholds to prevent premature evictions
+          # host.k3d.internal will allow pods to access host services (LocalStack)
+          k3d cluster create unity-builder \
+            --agents 1 \
+            --wait \
+            --k3s-arg '--kubelet-arg=eviction-hard=imagefs.available<5%,memory.available<100Mi@agent:*'
           kubectl config current-context | cat
       - name: Verify cluster readiness and LocalStack connectivity
         timeout-minutes: 2
@@ -111,40 +113,14 @@ jobs:
           }
           cleanup_k3d_nodes
           docker system prune -af --volumes || true
-          # Wait for disk pressure taints to clear (with timeout)
+          # Check for disk pressure taints (informational only - k3s will manage)
           echo "Checking for disk pressure taints on nodes..."
-          for i in {1..30}; do
-            if kubectl describe nodes | grep -q "node.kubernetes.io/disk-pressure"; then
-              echo "Disk pressure detected, waiting for it to clear... ($i/30)"
-              cleanup_k3d_nodes
-              docker system prune -af --volumes || true
-              sleep 2
-            else
-              echo "No disk pressure taints found"
-              break
-            fi
-          done
-          kubectl describe nodes | grep -i taint || echo "No taints found"
-      - name: Pre-pull Unity image into k3d node
-        timeout-minutes: 5
-        run: |
-          echo "Pre-pulling Unity image into k3d node to avoid evictions during tests..."
-          # Clean up old images first to make space
-          K3D_NODE_CONTAINERS="${K3D_NODE_CONTAINERS:-k3d-unity-builder-agent-0}"
-          for NODE in $K3D_NODE_CONTAINERS; do
-            docker exec "$NODE" sh -c "crictl rmi --prune 2>/dev/null || true" || true
-          done
-          # Pre-pull the Unity image that will be used in tests
-          # This ensures it's cached and doesn't need to be pulled during test execution
-          UNITY_IMAGE="unityci/editor:ubuntu-2021.3.45f1-base-3"
-          echo "Pulling ${UNITY_IMAGE} into k3d node..."
-          for NODE in $K3D_NODE_CONTAINERS; do
-            docker exec "$NODE" sh -c "crictl pull ${UNITY_IMAGE} 2>&1 || echo 'Image pull failed or already exists'" || true
-          done
-          echo "Image pre-pull completed. Checking disk space..."
-          for NODE in $K3D_NODE_CONTAINERS; do
-            docker exec "$NODE" sh -c "df -h / | tail -1" || true
-          done
+          if kubectl describe nodes 2>/dev/null | grep -q "node.kubernetes.io/disk-pressure"; then
+            echo "WARNING: Disk pressure taint detected. k3s will manage this automatically."
+            kubectl describe nodes | grep -i taint || true
+          else
+            echo "No disk pressure taints found"
+          fi
       - uses: actions/setup-node@v4
         with:
           node-version: 20
@@ -202,155 +178,9 @@ jobs:
           # Clean up disk space - aggressive cleanup to prevent evictions
           rm -rf ./cloud-runner-cache/* || true
           docker system prune -af --volumes || true
-          # Clean up disk space on k3d node to prevent ephemeral-storage evictions and disk pressure
-          echo "Cleaning up disk space on k3d node..."
-          # Use containerd/crictl commands (docker not available in k3d nodes)
-          K3D_NODE_CONTAINERS="${K3D_NODE_CONTAINERS:-k3d-unity-builder-agent-0}"
-          cleanup_k3d_nodes() {
-            for NODE in $K3D_NODE_CONTAINERS; do
-              docker exec "$NODE" sh -c "
-                crictl rmi --prune 2>/dev/null || true
-                crictl rmp --all 2>/dev/null || true
-                crictl images -q | xargs -r crictl rmi 2>/dev/null || true
-                find /var/lib/rancher/k3s/agent/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots -mindepth 1 -maxdepth 1 -type d -exec rm -rf {} + 2>/dev/null || true
-                find /var/lib/rancher/k3s/storage -mindepth 1 -maxdepth 1 -type d -exec rm -rf {} + 2>/dev/null || true
-                find /var/log -type f -name '*.log' -delete 2>/dev/null || true
-                find /tmp -type f -delete 2>/dev/null || true
-                find /var/lib/rancher/k3s -type f -name '*.log' -delete 2>/dev/null || true
-                df -h /
-              " || true
-            done
-          }
-          cleanup_k3d_nodes
-          # Clean up containerd snapshots and images more aggressively
-          cleanup_k3d_nodes
-          # Wait for disk pressure taints to clear before proceeding
-          echo "Checking for disk pressure taints..."
-          for i in {1..20}; do
-            if kubectl describe nodes 2>/dev/null | grep -q "node.kubernetes.io/disk-pressure"; then
-              echo "Disk pressure detected, cleaning up and waiting... ($i/20)"
-              for NODE in $K3D_NODE_CONTAINERS; do
-                docker exec "$NODE" sh -c "
-                  crictl rmi --prune 2>/dev/null || true
-                  crictl rmp --all 2>/dev/null || true
-                  crictl images -q | xargs -r crictl rmi 2>/dev/null || true
-                  find /var/lib/rancher/k3s/agent/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots -mindepth 1 -maxdepth 1 -type d -exec rm -rf {} + 2>/dev/null || true
-                  find /var/lib/rancher/k3s/storage -mindepth 1 -maxdepth 1 -type d -exec rm -rf {} + 2>/dev/null || true
-                " || true
-              done
-              docker system prune -af --volumes || true
-              sleep 3
-            else
-              echo "No disk pressure taints found, proceeding with test"
-              break
-            fi
-          done
-      - name: Ensure disk pressure cleared before test
-        timeout-minutes: 3
-        run: |
-          echo "Ensuring disk pressure is cleared before test..."
-          rm -rf ./cloud-runner-cache/* || true
-          docker system prune -af --volumes || true
-          K3D_NODE_CONTAINERS="${K3D_NODE_CONTAINERS:-k3d-unity-builder-agent-0}"
-          for NODE in $K3D_NODE_CONTAINERS; do
-            docker exec "$NODE" sh -c "docker system prune -af --volumes 2>/dev/null || true" || true
-          done
-          # Wait for disk pressure taints to clear (with aggressive cleanup)
-          # Limit to 10 attempts to avoid timeout - if cleanup doesn't work, just remove the taint
-          PREVIOUS_DISK_USAGE=100
-          for i in {1..10}; do
-            HAS_DISK_PRESSURE=$(kubectl describe nodes 2>/dev/null | grep -q "node.kubernetes.io/disk-pressure" && echo "true" || echo "false")
-            if [ "$HAS_DISK_PRESSURE" = "true" ]; then
-              echo "Disk pressure detected, cleaning up aggressively... ($i/10)"
-              # Check actual disk usage on the node
-              PRIMARY_NODE=$(echo "$K3D_NODE_CONTAINERS" | awk '{print $1}')
-              DISK_USAGE=$(docker exec "$PRIMARY_NODE" sh -c "df -h / 2>/dev/null | tail -1 | awk '{print \$5}' | sed 's/%//'" || echo "unknown")
-              echo "Current disk usage on k3d node: ${DISK_USAGE}%"
-
-              # Use k3s/containerd commands instead of docker (docker not available in k3d nodes)
-              # Clean up k3s containerd snapshots and images
-              for NODE in $K3D_NODE_CONTAINERS; do
-                docker exec "$NODE" sh -c "crictl rmi --prune 2>/dev/null || true" || true
-                docker exec "$NODE" sh -c "crictl rmp --all 2>/dev/null || true" || true
-              done
-              # Clean up old containerd snapshots
-              for NODE in $K3D_NODE_CONTAINERS; do
-                docker exec "$NODE" sh -c "find /var/lib/rancher/k3s/agent/containerd -type d -name 'snapshots' -exec rm -rf {}/* 2>/dev/null \; || true" || true
-              done
-              # Clean up k3s logs and temp files
-              for NODE in $K3D_NODE_CONTAINERS; do
-                docker exec "$NODE" sh -c "find /var/lib/rancher/k3s -type f -name '*.log' -delete 2>/dev/null || true" || true
-                docker exec "$NODE" sh -c "find /tmp -type f -mtime +0 -delete 2>/dev/null || true" || true
-                docker exec "$NODE" sh -c "find /var/log -type f -name '*.log' -mtime +0 -delete 2>/dev/null || true" || true
-              done
-              # Clean up host docker
-              docker system prune -af --volumes || true
-
-              # Check if disk usage improved
-              NEW_DISK_USAGE=$(docker exec "$PRIMARY_NODE" sh -c "df -h / 2>/dev/null | tail -1 | awk '{print \$5}' | sed 's/%//'" || echo "unknown")
-              if [ "$NEW_DISK_USAGE" != "unknown" ] && [ "$PREVIOUS_DISK_USAGE" != "unknown" ]; then
-                if [ "$NEW_DISK_USAGE" -ge "$PREVIOUS_DISK_USAGE" ] && [ "$i" -ge 3 ]; then
-                  echo "Disk usage not improving (${PREVIOUS_DISK_USAGE}% -> ${NEW_DISK_USAGE}%), breaking cleanup loop and removing taint manually"
-                  break
-                fi
-                PREVIOUS_DISK_USAGE=$NEW_DISK_USAGE
-              fi
-              sleep 3
-            else
-              echo "No disk pressure taints found, proceeding with test"
-              kubectl describe nodes | grep -i taint || echo "No taints found"
-              break
-            fi
-          done
-          # If disk pressure taint is still present after cleanup, manually remove it (CI only)
-          # Try multiple times as Kubernetes may re-add it if condition persists
-          if kubectl describe nodes 2>/dev/null | grep -q "node.kubernetes.io/disk-pressure"; then
-            echo "WARNING: Disk pressure taint still present after cleanup. Manually removing taint for CI..."
-            NODE_NAMES=$(kubectl get nodes -o name 2>/dev/null | sed 's/node\///' || echo "")
-            for node in $NODE_NAMES; do
-              # Try removing with NoSchedule effect (most common)
-              kubectl taint nodes "$node" node.kubernetes.io/disk-pressure:NoSchedule- 2>/dev/null || true
-              # Also try without effect specifier
-              kubectl taint nodes "$node" node.kubernetes.io/disk-pressure- 2>/dev/null || true
-              # Use patch as fallback
-              kubectl patch node "$node" -p '{"spec":{"taints":[]}}' 2>/dev/null || true
-            done
-            sleep 2
-            echo "Taint removal attempted. Checking nodes..."
-            kubectl describe nodes | grep -i taint || echo "No taints found"
-          fi
-          # Wait for disk pressure condition to clear (not just taint)
-          echo "Waiting for disk pressure condition to clear on nodes..."
-          for i in {1..20}; do
-            HAS_DISK_PRESSURE_CONDITION=$(kubectl get nodes -o json 2>/dev/null | grep -q '"type":"DiskPressure"' && echo "true" || echo "false")
-            if [ "$HAS_DISK_PRESSURE_CONDITION" = "true" ]; then
-              echo "Disk pressure condition still present, waiting... ($i/20)"
-              sleep 2
-            else
-              echo "Disk pressure condition cleared, proceeding with test"
-              break
-            fi
-          done
-          # Final check - if condition still exists, remove taint and wait a bit more
-          if kubectl get nodes -o json 2>/dev/null | grep -q '"type":"DiskPressure"'; then
-            echo "WARNING: Disk pressure condition still exists. Removing taint and waiting 10 seconds..."
-            NODE_NAMES=$(kubectl get nodes -o name 2>/dev/null | sed 's/node\///' || echo "")
-            for node in $NODE_NAMES; do
-              # Try removing with NoSchedule effect (most common)
-              kubectl taint nodes "$node" node.kubernetes.io/disk-pressure:NoSchedule- 2>/dev/null || true
-              # Also try without effect specifier
-              kubectl taint nodes "$node" node.kubernetes.io/disk-pressure- 2>/dev/null || true
-              # Use patch as fallback to remove all taints
-              kubectl patch node "$node" -p '{"spec":{"taints":[]}}' 2>/dev/null || true
-            done
-            sleep 10
-            # Verify taint is actually removed
-            if kubectl describe nodes 2>/dev/null | grep -q "node.kubernetes.io/disk-pressure"; then
-              echo "ERROR: Taint still present after removal attempts. This may cause pod scheduling issues."
-            else
-              echo "Taint successfully removed."
-            fi
-          fi
+          # Simple cleanup - trust k3s to manage resources
+          echo "Cleaning up test resources..."
+          docker system prune -f || true
       - name: Run cloud-runner-image test (validate image creation)
         timeout-minutes: 10
         run: yarn run test "cloud-runner-image" --detectOpenHandles --forceExit --runInBand
@@ -364,7 +194,7 @@ jobs:
       versioning: None
       KUBE_STORAGE_CLASS: local-path
       PROVIDER_STRATEGY: k8s
-      KUBE_VOLUME_SIZE: 5Gi
+      KUBE_VOLUME_SIZE: 2Gi
       containerCpu: '1000'
       containerMemory: '1024'
       AWS_ACCESS_KEY_ID: test
@@ -495,7 +325,7 @@ jobs:
       versioning: None
       KUBE_STORAGE_CLASS: local-path
       PROVIDER_STRATEGY: k8s
-      KUBE_VOLUME_SIZE: 5Gi
+      KUBE_VOLUME_SIZE: 2Gi
       ENABLE_K8S_E2E: 'true'
       containerCpu: '1000'
       containerMemory: '1024'
@@ -825,7 +655,7 @@ jobs:
       versioning: None
       KUBE_STORAGE_CLASS: local-path
       PROVIDER_STRATEGY: k8s
-      KUBE_VOLUME_SIZE: 5Gi
+      KUBE_VOLUME_SIZE: 2Gi
       # Set resource requests for tests - increased memory to prevent OOM kills
       containerCpu: '1000'
       containerMemory: '1024'
@@ -945,7 +775,7 @@ jobs:
       versioning: None
       KUBE_STORAGE_CLASS: local-path
       PROVIDER_STRATEGY: k8s
-      KUBE_VOLUME_SIZE: 5Gi
+      KUBE_VOLUME_SIZE: 2Gi
       containerCpu: '512'
       containerMemory: '512'
       AWS_ACCESS_KEY_ID: test