This commit is contained in:
frostebite
2026-01-23 21:13:39 +00:00
parent b2cb6ebb19
commit 1cf4f0326b
8 changed files with 107 additions and 60 deletions

View File

@@ -30,6 +30,8 @@ jobs:
runs-on: ubuntu-latest
env:
K3D_NODE_CONTAINERS: 'k3d-unity-builder-agent-0'
AWS_FORCE_PROVIDER: aws
RESOURCE_TRACKING: 'true'
steps:
# ==========================================
# SETUP SECTION
@@ -223,66 +225,6 @@ jobs:
kubectl run test-localstack --image=curlimages/curl --rm -i --restart=Never --timeout=10s -- \
curl -v --max-time 5 http://host.k3d.internal:4566/_localstack/health 2>&1 | head -20 || \
echo "Cluster connectivity test - if this fails, LocalStack may not be accessible from k3d"
- name: Pre-pull Unity image into k3d cluster
timeout-minutes: 15
run: |
# Pre-pull the Unity image into the k3d cluster before running tests
# This ensures it's cached in the k3d node's containerd and won't need to be pulled during test execution
UNITY_IMAGE="unityci/editor:ubuntu-2021.3.45f1-base-3"
# Check disk space before pulling
echo "Checking disk space before pre-pulling Unity image..."
K3D_NODE_CONTAINERS="${K3D_NODE_CONTAINERS:-k3d-unity-builder-agent-0 k3d-unity-builder-server-0}"
for NODE in $K3D_NODE_CONTAINERS; do
echo "Disk space in $NODE:"
docker exec "$NODE" sh -c "df -h /var/lib/rancher/k3s 2>/dev/null || df -h / 2>/dev/null || true" || true
done
# Clean up before pulling to ensure we have space
echo "Cleaning up before pre-pulling image..."
for NODE in $K3D_NODE_CONTAINERS; do
docker exec "$NODE" sh -c "crictl rm --all 2>/dev/null || true" || true
# Only remove non-Unity images to preserve space while keeping Unity image if already cached
docker exec "$NODE" sh -c "for img in \$(crictl images -q 2>/dev/null); do repo=\$(crictl inspecti \$img --format '{{.repo}}' 2>/dev/null || echo ''); if echo \"\$repo\" | grep -qvE 'unityci/editor|unity'; then crictl rmi \$img 2>/dev/null || true; fi; done" || true
done || true
# Explicitly pull the image on BOTH nodes to ensure it's cached wherever pods might be scheduled
# This prevents "no space left" errors when pods are scheduled on nodes without the cached image
echo "Pulling Unity image directly on each node to ensure it's cached..."
for NODE in $K3D_NODE_CONTAINERS; do
echo "Checking if image already exists on $NODE..."
IMAGE_EXISTS=$(docker exec "$NODE" sh -c "crictl images | grep -q unityci/editor && echo 'yes' || echo 'no'" || echo "no")
if [ "$IMAGE_EXISTS" = "yes" ]; then
echo "Unity image already cached on $NODE, skipping pull"
else
echo "Pulling Unity image on $NODE (this may take several minutes for 3.9GB image)..."
# Use crictl pull directly in the node's containerd
# This ensures the image is cached in the node's local storage
# Use timeout to prevent hanging indefinitely (10 minutes max)
if timeout 600 docker exec "$NODE" sh -c "crictl pull $UNITY_IMAGE 2>&1"; then
echo "Successfully pulled image on $NODE"
# Verify it's cached
docker exec "$NODE" sh -c "crictl images | grep unityci/editor || echo 'Warning: Image not found after pull'" || true
else
PULL_EXIT_CODE=$?
if [ $PULL_EXIT_CODE -eq 124 ]; then
echo "Warning: Image pull on $NODE timed out after 10 minutes. Checking if partially cached..."
else
echo "Warning: Image pull on $NODE failed (exit code: $PULL_EXIT_CODE). Checking if partially cached..."
fi
docker exec "$NODE" sh -c "crictl images | grep unityci/editor || echo 'Image not found on $NODE'" || true
echo "Note: Pods scheduled on $NODE will attempt to pull the image during runtime, which may fail if disk space is insufficient."
fi
fi
done
# Verify image is cached
echo "Checking if Unity image is cached..."
for NODE in $K3D_NODE_CONTAINERS; do
docker exec "$NODE" sh -c "crictl images | grep unityci/editor || echo 'Image not found in $NODE'" || true
done
echo "Image pre-pull completed. Image should be cached in k3d node."
- name: Clean up K8s test resources before tests
run: |
echo "Cleaning up K8s test resources..."

View File

@@ -194,6 +194,10 @@ inputs:
description:
'[CloudRunner] Either local, k8s or aws can be used to run builds on a remote cluster. Additional parameters must
be configured.'
resourceTracking:
default: 'false'
required: false
description: '[CloudRunner] Enable resource tracking logs for disk usage and allocation summaries.'
containerCpu:
default: ''
required: false

View File

@@ -19,6 +19,7 @@ import SharedWorkspaceLocking from './services/core/shared-workspace-locking';
import { FollowLogStreamService } from './services/core/follow-log-stream-service';
import CloudRunnerResult from './services/core/cloud-runner-result';
import CloudRunnerOptions from './options/cloud-runner-options';
import ResourceTracking from './services/core/resource-tracking';
class CloudRunner {
public static Provider: ProviderInterface;
@@ -37,6 +38,8 @@ class CloudRunner {
CloudRunnerLogger.setup();
CloudRunnerLogger.log(`Setting up cloud runner`);
CloudRunner.buildParameters = buildParameters;
ResourceTracking.logAllocationSummary('setup');
await ResourceTracking.logDiskUsageSnapshot('setup');
if (CloudRunner.buildParameters.githubCheckId === ``) {
CloudRunner.buildParameters.githubCheckId = await GitHub.createGitHubCheck(CloudRunner.buildParameters.buildGuid);
}

View File

@@ -287,6 +287,10 @@ class CloudRunnerOptions {
return CloudRunnerOptions.getInput('asyncCloudRunner') === 'true';
}
/**
 * Whether the `resourceTracking` input was supplied as the exact string `'true'`.
 * Any other value (including a missing input) yields `false`.
 */
public static get resourceTracking(): boolean {
  const rawValue = CloudRunnerOptions.getInput('resourceTracking');

  return rawValue === 'true';
}
/**
 * Whether the `useLargePackages` input was supplied as the exact string `true`.
 * Any other value (including a missing input) yields `false`.
 */
public static get useLargePackages(): boolean {
  const rawValue = CloudRunnerOptions.getInput(`useLargePackages`);

  return rawValue === `true`;
}

View File

@@ -15,6 +15,7 @@ import { ProviderWorkflow } from '../provider-workflow';
import { TaskService } from './services/task-service';
import CloudRunnerOptions from '../../options/cloud-runner-options';
import { AwsClientFactory } from './aws-client-factory';
import ResourceTracking from '../../services/core/resource-tracking';
class AWSBuildEnvironment implements ProviderInterface {
private baseStackName: string;
@@ -90,6 +91,8 @@ class AWSBuildEnvironment implements ProviderInterface {
secrets: CloudRunnerSecret[],
): Promise<string> {
process.env.AWS_REGION = Input.region;
ResourceTracking.logAllocationSummary('aws workflow');
await ResourceTracking.logDiskUsageSnapshot('aws workflow (host)');
AwsClientFactory.getECS();
const CF = AwsClientFactory.getCloudFormation();
AwsClientFactory.getKinesis();

View File

@@ -17,6 +17,7 @@ import { ProviderWorkflow } from '../provider-workflow';
import { RemoteClientLogger } from '../../remote-client/remote-client-logger';
import { KubernetesRole } from './kubernetes-role';
import { CloudRunnerSystem } from '../../services/core/cloud-runner-system';
import ResourceTracking from '../../services/core/resource-tracking';
class Kubernetes implements ProviderInterface {
public static Instance: Kubernetes;
@@ -137,6 +138,9 @@ class Kubernetes implements ProviderInterface {
): Promise<string> {
try {
CloudRunnerLogger.log('Cloud Runner K8s workflow!');
ResourceTracking.logAllocationSummary('k8s workflow');
await ResourceTracking.logDiskUsageSnapshot('k8s workflow (host)');
await ResourceTracking.logK3dNodeDiskUsage('k8s workflow (before job)');
// Setup
const id =

View File

@@ -14,11 +14,13 @@ import GitHub from '../../github';
import BuildParameters from '../../build-parameters';
import { Cli } from '../../cli/cli';
import CloudRunnerOptions from '../options/cloud-runner-options';
import ResourceTracking from '../services/core/resource-tracking';
export class RemoteClient {
@CliFunction(`remote-cli-pre-build`, `sets up a repository, usually before a game-ci build`)
static async setupRemoteClient() {
CloudRunnerLogger.log(`bootstrap game ci cloud runner...`);
await ResourceTracking.logDiskUsageSnapshot('remote-cli-pre-build (start)');
if (!(await RemoteClient.handleRetainedWorkspace())) {
await RemoteClient.bootstrapRepository();
}
@@ -206,6 +208,7 @@ export class RemoteClient {
// that read from the log file rather than stdout
RemoteClientLogger.log(successMessage);
CloudRunnerLogger.log(successMessage);
await ResourceTracking.logDiskUsageSnapshot('remote-cli-post-build (end)');
return new Promise((result) => result(``));
}

View File

@@ -0,0 +1,84 @@
import CloudRunnerLogger from './cloud-runner-logger';
import CloudRunnerOptions from '../../options/cloud-runner-options';
import CloudRunner from '../../cloud-runner';
import { CloudRunnerSystem } from './cloud-runner-system';
/**
 * Opt-in diagnostics that write disk-usage snapshots and a summary of the
 * configured resource allocations to the cloud runner log.
 *
 * Every public method is a no-op unless {@link ResourceTracking.isEnabled}
 * returns `true`. The helpers are best-effort: a failing shell command is
 * logged and never propagated to the caller.
 */
class ResourceTracking {
  /** Node containers inspected when `K3D_NODE_CONTAINERS` is unset or yields no usable name. */
  private static readonly defaultK3dNodes: readonly string[] = [
    'k3d-unity-builder-agent-0',
    'k3d-unity-builder-server-0',
  ];

  /** Shell commands executed, in order, for every disk usage snapshot. */
  private static readonly diskUsageCommands: readonly string[] = [
    'df -h',
    'du -sh .',
    'du -sh ./cloud-runner-cache',
    'du -sh ./temp',
    'du -sh ./logs',
  ];

  /**
   * Reports whether tracking output should be produced.
   *
   * @returns `true` when the `resourceTracking` option or the
   * `cloudRunnerDebug` option is set, or when the `cloudRunnerTests`
   * environment variable equals `'true'`.
   */
  static isEnabled(): boolean {
    return (
      CloudRunnerOptions.resourceTracking ||
      CloudRunnerOptions.cloudRunnerDebug ||
      process.env['cloudRunnerTests'] === 'true'
    );
  }

  /**
   * Logs the resource-related build parameters as pretty-printed JSON.
   *
   * @param context Free-form label identifying the call site in the log.
   */
  static logAllocationSummary(context: string) {
    if (!ResourceTracking.isEnabled()) {
      return;
    }

    const buildParameters = CloudRunner.buildParameters;

    // A diagnostic helper must never abort the build: if it is invoked before
    // the build parameters have been assigned, say so instead of throwing.
    if (!buildParameters) {
      CloudRunnerLogger.log(
        `[ResourceTracking] Allocation summary (${context}): build parameters are not available yet`,
      );

      return;
    }

    const allocations = {
      providerStrategy: buildParameters.providerStrategy,
      containerCpu: buildParameters.containerCpu,
      containerMemory: buildParameters.containerMemory,
      dockerCpuLimit: buildParameters.dockerCpuLimit,
      dockerMemoryLimit: buildParameters.dockerMemoryLimit,
      kubeVolumeSize: buildParameters.kubeVolumeSize,
      kubeStorageClass: buildParameters.kubeStorageClass,
      kubeVolume: buildParameters.kubeVolume,
      containerNamespace: buildParameters.containerNamespace,
      storageProvider: buildParameters.storageProvider,
      rcloneRemote: buildParameters.rcloneRemote,
      dockerWorkspacePath: buildParameters.dockerWorkspacePath,
      cacheKey: buildParameters.cacheKey,
      maxRetainedWorkspaces: buildParameters.maxRetainedWorkspaces,
      useCompressionStrategy: buildParameters.useCompressionStrategy,
      useLargePackages: buildParameters.useLargePackages,

      // NOTE(review): these literals presumably mirror the ephemeral-storage
      // request applied elsewhere in the project - confirm they stay in sync.
      ephemeralStorageRequest: process.env['cloudRunnerTests'] === 'true' ? 'not set' : '2Gi',
    };

    CloudRunnerLogger.log(`[ResourceTracking] Allocation summary (${context}):`);
    CloudRunnerLogger.log(JSON.stringify(allocations, undefined, 2));
  }

  /**
   * Logs `df -h` plus the size of the working directory and of its
   * `cloud-runner-cache`, `temp` and `logs` sub-directories.
   *
   * A directory that does not exist makes its `du` command fail; that failure
   * is logged by `runAndLog` and the remaining commands still run.
   *
   * @param context Free-form label identifying the call site in the log.
   */
  static async logDiskUsageSnapshot(context: string) {
    if (!ResourceTracking.isEnabled()) {
      return;
    }

    CloudRunnerLogger.log(`[ResourceTracking] Disk usage snapshot (${context})`);
    for (const command of ResourceTracking.diskUsageCommands) {
      // The command doubles as its own label in the log output.
      await ResourceTracking.runAndLog(command, command);
    }
  }

  /**
   * Logs the disk usage reported from inside each k3d node container.
   *
   * The node list comes from the whitespace-separated `K3D_NODE_CONTAINERS`
   * environment variable and falls back to the default agent/server pair.
   *
   * @param context Free-form label identifying the call site in the log.
   */
  static async logK3dNodeDiskUsage(context: string) {
    if (!ResourceTracking.isEnabled()) {
      return;
    }

    const nodes = ResourceTracking.resolveK3dNodes();
    CloudRunnerLogger.log(`[ResourceTracking] K3d node disk usage (${context})`);
    for (const node of nodes) {
      // The trailing `|| true` keeps the command successful when the container
      // does not exist (for example on a cluster that is not k3d).
      await ResourceTracking.runAndLog(
        `k3d node ${node}`,
        `docker exec ${node} sh -c "df -h /var/lib/rancher/k3s 2>/dev/null || df -h / 2>/dev/null || true" || true`,
      );
    }
  }

  /**
   * Determines which node containers to inspect.
   *
   * Names are interpolated into a shell command, so only tokens made of
   * letters, digits, `_`, `.` and `-` are accepted; anything else is dropped.
   */
  private static resolveK3dNodes(): string[] {
    const configured = (process.env['K3D_NODE_CONTAINERS'] ?? '')
      .split(/\s+/)
      .filter((name) => /^[\w.-]+$/.test(name));

    return configured.length > 0 ? configured : [...ResourceTracking.defaultK3dNodes];
  }

  /**
   * Runs a shell command and logs its trimmed output under `label`.
   * Failures are logged rather than rethrown so tracking never breaks a build.
   *
   * @param label Text used to identify the command in the log.
   * @param command Shell command line handed to `CloudRunnerSystem.Run`.
   */
  private static async runAndLog(label: string, command: string) {
    try {
      // NOTE(review): the two boolean flags are assumed to suppress errors and
      // log output of `Run` - confirm against CloudRunnerSystem.
      const output = await CloudRunnerSystem.Run(command, true, true);
      const trimmed = output.trim();
      CloudRunnerLogger.log(`[ResourceTracking] ${label}:\n${trimmed || 'no output'}`);
    } catch (error: unknown) {
      CloudRunnerLogger.log(`[ResourceTracking] ${label} failed: ${ResourceTracking.describeError(error)}`);
    }
  }

  /** Extracts a printable message from an arbitrary thrown value. */
  private static describeError(error: unknown): string {
    if (typeof error === 'object' && error !== null && 'message' in error) {
      const message = (error as { message?: unknown }).message;
      if (message) {
        return String(message);
      }
    }

    return String(error);
  }
}
export default ResourceTracking;