diff --git a/.github/workflows/cloud-runner-integrity.yml b/.github/workflows/cloud-runner-integrity.yml
index 55a0b357..95978a52 100644
--- a/.github/workflows/cloud-runner-integrity.yml
+++ b/.github/workflows/cloud-runner-integrity.yml
@@ -30,6 +30,8 @@ jobs:
     runs-on: ubuntu-latest
     env:
       K3D_NODE_CONTAINERS: 'k3d-unity-builder-agent-0'
+      AWS_FORCE_PROVIDER: aws
+      RESOURCE_TRACKING: 'true'
     steps:
       # ==========================================
       # SETUP SECTION
       # ==========================================
@@ -223,66 +225,6 @@ jobs:
           kubectl run test-localstack --image=curlimages/curl --rm -i --restart=Never --timeout=10s -- \
             curl -v --max-time 5 http://host.k3d.internal:4566/_localstack/health 2>&1 | head -20 || \
             echo "Cluster connectivity test - if this fails, LocalStack may not be accessible from k3d"
-      - name: Pre-pull Unity image into k3d cluster
-        timeout-minutes: 15
-        run: |
-          # Pre-pull the Unity image into the k3d cluster before running tests
-          # This ensures it's cached in the k3d node's containerd and won't need to be pulled during test execution
-          UNITY_IMAGE="unityci/editor:ubuntu-2021.3.45f1-base-3"
-
-          # Check disk space before pulling
-          echo "Checking disk space before pre-pulling Unity image..."
-          K3D_NODE_CONTAINERS="${K3D_NODE_CONTAINERS:-k3d-unity-builder-agent-0 k3d-unity-builder-server-0}"
-          for NODE in $K3D_NODE_CONTAINERS; do
-            echo "Disk space in $NODE:"
-            docker exec "$NODE" sh -c "df -h /var/lib/rancher/k3s 2>/dev/null || df -h / 2>/dev/null || true" || true
-          done
-
-          # Clean up before pulling to ensure we have space
-          echo "Cleaning up before pre-pulling image..."
-          for NODE in $K3D_NODE_CONTAINERS; do
-            docker exec "$NODE" sh -c "crictl rm --all 2>/dev/null || true" || true
-            # Only remove non-Unity images to preserve space while keeping Unity image if already cached
-            docker exec "$NODE" sh -c "for img in \$(crictl images -q 2>/dev/null); do repo=\$(crictl inspecti \$img --format '{{.repo}}' 2>/dev/null || echo ''); if echo \"\$repo\" | grep -qvE 'unityci/editor|unity'; then crictl rmi \$img 2>/dev/null || true; fi; done" || true
-          done || true
-
-          # Explicitly pull the image on BOTH nodes to ensure it's cached wherever pods might be scheduled
-          # This prevents "no space left" errors when pods are scheduled on nodes without the cached image
-          echo "Pulling Unity image directly on each node to ensure it's cached..."
-          for NODE in $K3D_NODE_CONTAINERS; do
-            echo "Checking if image already exists on $NODE..."
-            IMAGE_EXISTS=$(docker exec "$NODE" sh -c "crictl images | grep -q unityci/editor && echo 'yes' || echo 'no'" || echo "no")
-            if [ "$IMAGE_EXISTS" = "yes" ]; then
-              echo "Unity image already cached on $NODE, skipping pull"
-            else
-              echo "Pulling Unity image on $NODE (this may take several minutes for 3.9GB image)..."
-              # Use crictl pull directly in the node's containerd
-              # This ensures the image is cached in the node's local storage
-              # Use timeout to prevent hanging indefinitely (10 minutes max)
-              if timeout 600 docker exec "$NODE" sh -c "crictl pull $UNITY_IMAGE 2>&1"; then
-                echo "Successfully pulled image on $NODE"
-                # Verify it's cached
-                docker exec "$NODE" sh -c "crictl images | grep unityci/editor || echo 'Warning: Image not found after pull'" || true
-              else
-                PULL_EXIT_CODE=$?
-                if [ $PULL_EXIT_CODE -eq 124 ]; then
-                  echo "Warning: Image pull on $NODE timed out after 10 minutes. Checking if partially cached..."
-                else
-                  echo "Warning: Image pull on $NODE failed (exit code: $PULL_EXIT_CODE). Checking if partially cached..."
-                fi
-                docker exec "$NODE" sh -c "crictl images | grep unityci/editor || echo 'Image not found on $NODE'" || true
-                echo "Note: Pods scheduled on $NODE will attempt to pull the image during runtime, which may fail if disk space is insufficient."
-              fi
-            fi
-          done
-
-          # Verify image is cached
-          echo "Checking if Unity image is cached..."
-          for NODE in $K3D_NODE_CONTAINERS; do
-            docker exec "$NODE" sh -c "crictl images | grep unityci/editor || echo 'Image not found in $NODE'" || true
-          done
-
-          echo "Image pre-pull completed. Image should be cached in k3d node."
       - name: Clean up K8s test resources before tests
         run: |
           echo "Cleaning up K8s test resources..."
diff --git a/action.yml b/action.yml
index 791df50e..5dae3ac7 100644
--- a/action.yml
+++ b/action.yml
@@ -194,6 +194,10 @@ inputs:
     description:
       '[CloudRunner] Either local, k8s or aws can be used to run builds on a remote cluster. Additional parameters
       must be configured.'
+  resourceTracking:
+    default: 'false'
+    required: false
+    description: '[CloudRunner] Enable resource tracking logs for disk usage and allocation summaries.'
   containerCpu:
     default: ''
     required: false
diff --git a/src/model/cloud-runner/cloud-runner.ts b/src/model/cloud-runner/cloud-runner.ts
index 9b44a7bf..4c41e7cd 100644
--- a/src/model/cloud-runner/cloud-runner.ts
+++ b/src/model/cloud-runner/cloud-runner.ts
@@ -19,6 +19,7 @@ import SharedWorkspaceLocking from './services/core/shared-workspace-locking';
 import { FollowLogStreamService } from './services/core/follow-log-stream-service';
 import CloudRunnerResult from './services/core/cloud-runner-result';
 import CloudRunnerOptions from './options/cloud-runner-options';
+import ResourceTracking from './services/core/resource-tracking';

 class CloudRunner {
   public static Provider: ProviderInterface;
@@ -37,6 +38,8 @@ class CloudRunner {
     CloudRunnerLogger.setup();
     CloudRunnerLogger.log(`Setting up cloud runner`);
     CloudRunner.buildParameters = buildParameters;
+    ResourceTracking.logAllocationSummary('setup');
+    await ResourceTracking.logDiskUsageSnapshot('setup');
     if (CloudRunner.buildParameters.githubCheckId === ``) {
       CloudRunner.buildParameters.githubCheckId = await GitHub.createGitHubCheck(CloudRunner.buildParameters.buildGuid);
     }
diff --git a/src/model/cloud-runner/options/cloud-runner-options.ts b/src/model/cloud-runner/options/cloud-runner-options.ts
index a1145fb8..661d583d 100644
--- a/src/model/cloud-runner/options/cloud-runner-options.ts
+++ b/src/model/cloud-runner/options/cloud-runner-options.ts
@@ -287,6 +287,10 @@ class CloudRunnerOptions {
     return CloudRunnerOptions.getInput('asyncCloudRunner') === 'true';
   }

+  public static get resourceTracking(): boolean {
+    return CloudRunnerOptions.getInput('resourceTracking') === 'true';
+  }
+
   public static get useLargePackages(): boolean {
     return CloudRunnerOptions.getInput(`useLargePackages`) === `true`;
   }
diff --git a/src/model/cloud-runner/providers/aws/index.ts b/src/model/cloud-runner/providers/aws/index.ts
index d57febdd..93fa1d46 100644
--- a/src/model/cloud-runner/providers/aws/index.ts
+++ b/src/model/cloud-runner/providers/aws/index.ts
@@ -15,6 +15,7 @@ import { ProviderWorkflow } from '../provider-workflow';
 import { TaskService } from './services/task-service';
 import CloudRunnerOptions from '../../options/cloud-runner-options';
 import { AwsClientFactory } from './aws-client-factory';
+import ResourceTracking from '../../services/core/resource-tracking';

 class AWSBuildEnvironment implements ProviderInterface {
   private baseStackName: string;
@@ -90,6 +91,8 @@ class AWSBuildEnvironment implements ProviderInterface {
     secrets: CloudRunnerSecret[],
   ): Promise<string> {
     process.env.AWS_REGION = Input.region;
+    ResourceTracking.logAllocationSummary('aws workflow');
+    await ResourceTracking.logDiskUsageSnapshot('aws workflow (host)');
     AwsClientFactory.getECS();
     const CF = AwsClientFactory.getCloudFormation();
     AwsClientFactory.getKinesis();
diff --git a/src/model/cloud-runner/providers/k8s/index.ts b/src/model/cloud-runner/providers/k8s/index.ts
index 2f4bd1f1..b53403cc 100644
--- a/src/model/cloud-runner/providers/k8s/index.ts
+++ b/src/model/cloud-runner/providers/k8s/index.ts
@@ -17,6 +17,7 @@ import { ProviderWorkflow } from '../provider-workflow';
 import { RemoteClientLogger } from '../../remote-client/remote-client-logger';
 import { KubernetesRole } from './kubernetes-role';
 import { CloudRunnerSystem } from '../../services/core/cloud-runner-system';
+import ResourceTracking from '../../services/core/resource-tracking';

 class Kubernetes implements ProviderInterface {
   public static Instance: Kubernetes;
@@ -137,6 +138,9 @@ class Kubernetes implements ProviderInterface {
   ): Promise<string> {
     try {
       CloudRunnerLogger.log('Cloud Runner K8s workflow!');
+      ResourceTracking.logAllocationSummary('k8s workflow');
+      await ResourceTracking.logDiskUsageSnapshot('k8s workflow (host)');
+      await ResourceTracking.logK3dNodeDiskUsage('k8s workflow (before job)');

       // Setup
       const id =
diff --git a/src/model/cloud-runner/remote-client/index.ts b/src/model/cloud-runner/remote-client/index.ts
index 7ad50d49..8412f717 100644
--- a/src/model/cloud-runner/remote-client/index.ts
+++ b/src/model/cloud-runner/remote-client/index.ts
@@ -14,11 +14,13 @@ import GitHub from '../../github';
 import BuildParameters from '../../build-parameters';
 import { Cli } from '../../cli/cli';
 import CloudRunnerOptions from '../options/cloud-runner-options';
+import ResourceTracking from '../services/core/resource-tracking';

 export class RemoteClient {
   @CliFunction(`remote-cli-pre-build`, `sets up a repository, usually before a game-ci build`)
   static async setupRemoteClient() {
     CloudRunnerLogger.log(`bootstrap game ci cloud runner...`);
+    await ResourceTracking.logDiskUsageSnapshot('remote-cli-pre-build (start)');
     if (!(await RemoteClient.handleRetainedWorkspace())) {
       await RemoteClient.bootstrapRepository();
     }
@@ -206,6 +208,7 @@ export class RemoteClient {
     // that read from the log file rather than stdout
     RemoteClientLogger.log(successMessage);
     CloudRunnerLogger.log(successMessage);
+    await ResourceTracking.logDiskUsageSnapshot('remote-cli-post-build (end)');
     return new Promise((result) => result(``));
   }

diff --git a/src/model/cloud-runner/services/core/resource-tracking.ts b/src/model/cloud-runner/services/core/resource-tracking.ts
new file mode 100644
index 00000000..a6a2d004
--- /dev/null
+++ b/src/model/cloud-runner/services/core/resource-tracking.ts
@@ -0,0 +1,84 @@
+import CloudRunnerLogger from './cloud-runner-logger';
+import CloudRunnerOptions from '../../options/cloud-runner-options';
+import CloudRunner from '../../cloud-runner';
+import { CloudRunnerSystem } from './cloud-runner-system';
+
+class ResourceTracking {
+  static isEnabled(): boolean {
+    return (
+      CloudRunnerOptions.resourceTracking ||
+      CloudRunnerOptions.cloudRunnerDebug ||
+      process.env['cloudRunnerTests'] === 'true'
+    );
+  }
+
+  static logAllocationSummary(context: string) {
+    if (!ResourceTracking.isEnabled()) {
+      return;
+    }
+
+    const buildParameters = CloudRunner.buildParameters;
+    const allocations = {
+      providerStrategy: buildParameters.providerStrategy,
+      containerCpu: buildParameters.containerCpu,
+      containerMemory: buildParameters.containerMemory,
+      dockerCpuLimit: buildParameters.dockerCpuLimit,
+      dockerMemoryLimit: buildParameters.dockerMemoryLimit,
+      kubeVolumeSize: buildParameters.kubeVolumeSize,
+      kubeStorageClass: buildParameters.kubeStorageClass,
+      kubeVolume: buildParameters.kubeVolume,
+      containerNamespace: buildParameters.containerNamespace,
+      storageProvider: buildParameters.storageProvider,
+      rcloneRemote: buildParameters.rcloneRemote,
+      dockerWorkspacePath: buildParameters.dockerWorkspacePath,
+      cacheKey: buildParameters.cacheKey,
+      maxRetainedWorkspaces: buildParameters.maxRetainedWorkspaces,
+      useCompressionStrategy: buildParameters.useCompressionStrategy,
+      useLargePackages: buildParameters.useLargePackages,
+      ephemeralStorageRequest: process.env['cloudRunnerTests'] === 'true' ? 'not set' : '2Gi',
+    };
+
+    CloudRunnerLogger.log(`[ResourceTracking] Allocation summary (${context}):`);
+    CloudRunnerLogger.log(JSON.stringify(allocations, undefined, 2));
+  }
+
+  static async logDiskUsageSnapshot(context: string) {
+    if (!ResourceTracking.isEnabled()) {
+      return;
+    }
+
+    CloudRunnerLogger.log(`[ResourceTracking] Disk usage snapshot (${context})`);
+    await ResourceTracking.runAndLog('df -h', 'df -h');
+    await ResourceTracking.runAndLog('du -sh .', 'du -sh .');
+    await ResourceTracking.runAndLog('du -sh ./cloud-runner-cache', 'du -sh ./cloud-runner-cache');
+    await ResourceTracking.runAndLog('du -sh ./temp', 'du -sh ./temp');
+    await ResourceTracking.runAndLog('du -sh ./logs', 'du -sh ./logs');
+  }
+
+  static async logK3dNodeDiskUsage(context: string) {
+    if (!ResourceTracking.isEnabled()) {
+      return;
+    }
+
+    const nodes = ['k3d-unity-builder-agent-0', 'k3d-unity-builder-server-0'];
+    CloudRunnerLogger.log(`[ResourceTracking] K3d node disk usage (${context})`);
+    for (const node of nodes) {
+      await ResourceTracking.runAndLog(
+        `k3d node ${node}`,
+        `docker exec ${node} sh -c "df -h /var/lib/rancher/k3s 2>/dev/null || df -h / 2>/dev/null || true" || true`,
+      );
+    }
+  }
+
+  private static async runAndLog(label: string, command: string) {
+    try {
+      const output = await CloudRunnerSystem.Run(command, true, true);
+      const trimmed = output.trim();
+      CloudRunnerLogger.log(`[ResourceTracking] ${label}:\n${trimmed || 'no output'}`);
+    } catch (error: any) {
+      CloudRunnerLogger.log(`[ResourceTracking] ${label} failed: ${error?.message || error}`);
+    }
+  }
+}
+
+export default ResourceTracking;
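
Usage note: the new resourceTracking input gates the ResourceTracking logs; tracking also activates when cloudRunnerDebug or the cloudRunnerTests env var is set (see isEnabled above), and the integrity workflow enables it through the RESOURCE_TRACKING job env var. A minimal sketch of turning it on from a consumer workflow follows; the action reference, version tag, and the other inputs shown are illustrative assumptions, not taken from this diff:

      - name: Build with resource tracking      # hypothetical step name
        uses: game-ci/unity-builder@v4          # assumed action reference and version tag
        with:
          providerStrategy: k8s                 # assumed existing CloudRunner input; aws/local also apply
          resourceTracking: true                # new input added in action.yml above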