This commit is contained in:
Frostebite
2026-01-20 04:42:23 +00:00
parent ad5dd3b9c1
commit 9aa24e21f1
8 changed files with 527 additions and 47 deletions

View File

@@ -199,14 +199,14 @@ class Kubernetes implements ProviderInterface {
if (process.env['cloudRunnerTests'] === 'true' && image.includes('unityci/editor')) {
try {
const { CloudRunnerSystem } = await import('../../services/core/cloud-runner-system');
// Check if image is cached on agent node (where pods run)
const agentImageCheck = await CloudRunnerSystem.Run(
`docker exec k3d-unity-builder-agent-0 sh -c "crictl images | grep -q unityci/editor && echo 'cached' || echo 'not_cached'" || echo 'not_cached'`,
true,
true,
);
if (agentImageCheck.includes('not_cached')) {
// Check if image is on server node
const serverImageCheck = await CloudRunnerSystem.Run(
@@ -214,18 +214,20 @@ class Kubernetes implements ProviderInterface {
true,
true,
);
// Check available disk space on agent node
const diskInfo = await CloudRunnerSystem.Run(
'docker exec k3d-unity-builder-agent-0 sh -c "df -h /var/lib/rancher/k3s 2>/dev/null | tail -1 || df -h / 2>/dev/null | tail -1 || echo unknown" || echo unknown',
true,
true,
);
CloudRunnerLogger.logWarning(
`Unity image not cached on agent node (where pods run). Server node: ${serverImageCheck.includes('cached') ? 'has image' : 'no image'}. Disk info: ${diskInfo.trim()}. Pod will attempt to pull image (3.9GB) which may fail due to disk pressure.`,
`Unity image not cached on agent node (where pods run). Server node: ${
serverImageCheck.includes('cached') ? 'has image' : 'no image'
}. Disk info: ${diskInfo.trim()}. Pod will attempt to pull image (3.9GB) which may fail due to disk pressure.`,
);
// If image is on server but not agent, log a warning
// NOTE: We don't attempt to pull here because:
// 1. Pulling a 3.9GB image can take several minutes and block the test
@@ -244,17 +246,19 @@ class Kubernetes implements ProviderInterface {
const availableValue = parseFloat(availableSpaceMatch[1]);
const availableUnit = availableSpaceMatch[2].toUpperCase();
let availableGB = availableValue;
if (availableUnit.includes('M')) {
availableGB = availableValue / 1024;
} else if (availableUnit.includes('K')) {
availableGB = availableValue / (1024 * 1024);
}
// Unity image is ~3.9GB, need at least 4.5GB to be safe
if (availableGB < 4.5) {
CloudRunnerLogger.logWarning(
`CRITICAL: Unity image not cached and only ${availableGB.toFixed(2)}GB available. Image pull (3.9GB) will likely fail. Consider running cleanup or ensuring pre-pull step succeeds.`,
`CRITICAL: Unity image not cached and only ${availableGB.toFixed(
2,
)}GB available. Image pull (3.9GB) will likely fail. Consider running cleanup or ensuring pre-pull step succeeds.`,
);
}
}
@@ -267,7 +271,7 @@ class Kubernetes implements ProviderInterface {
CloudRunnerLogger.logWarning(`Failed to verify Unity image cache: ${checkError}`);
}
}
CloudRunnerLogger.log('Job does not exist');
await this.createJob(commands, image, mountdir, workingdir, environment, secrets);
CloudRunnerLogger.log('Watching pod until running');

View File

@@ -50,23 +50,23 @@ class KubernetesStorage {
let checkCount = 0;
try {
CloudRunnerLogger.log(`watch Until PVC Not Pending ${name} ${namespace}`);
// Check if storage class uses WaitForFirstConsumer binding mode
// If so, skip waiting - PVC will bind when pod is created
let shouldSkipWait = false;
try {
const pvcBody = (await kubeClient.readNamespacedPersistentVolumeClaim(name, namespace)).body;
const storageClassName = pvcBody.spec?.storageClassName;
if (storageClassName) {
const kubeConfig = new k8s.KubeConfig();
kubeConfig.loadFromDefault();
const storageV1Api = kubeConfig.makeApiClient(k8s.StorageV1Api);
try {
const sc = await storageV1Api.readStorageClass(storageClassName);
const volumeBindingMode = sc.body.volumeBindingMode;
if (volumeBindingMode === 'WaitForFirstConsumer') {
CloudRunnerLogger.log(
`StorageClass "${storageClassName}" uses WaitForFirstConsumer binding mode. PVC will bind when pod is created. Skipping wait.`,
@@ -75,32 +75,36 @@ class KubernetesStorage {
}
} catch (scError) {
// If we can't check the storage class, proceed with normal wait
CloudRunnerLogger.log(`Could not check storage class binding mode: ${scError}. Proceeding with normal wait.`);
CloudRunnerLogger.log(
`Could not check storage class binding mode: ${scError}. Proceeding with normal wait.`,
);
}
}
} catch (pvcReadError) {
// If we can't read PVC, proceed with normal wait
CloudRunnerLogger.log(`Could not read PVC to check storage class: ${pvcReadError}. Proceeding with normal wait.`);
CloudRunnerLogger.log(
`Could not read PVC to check storage class: ${pvcReadError}. Proceeding with normal wait.`,
);
}
if (shouldSkipWait) {
CloudRunnerLogger.log(`Skipping PVC wait - will bind when pod is created`);
return;
}
const initialPhase = await this.getPVCPhase(kubeClient, name, namespace);
CloudRunnerLogger.log(`Initial PVC phase: ${initialPhase}`);
// Wait until PVC is NOT Pending (i.e., Bound or Available)
await waitUntil(
async () => {
checkCount++;
const phase = await this.getPVCPhase(kubeClient, name, namespace);
// Log progress every 4 checks (every ~60 seconds)
if (checkCount % 4 === 0) {
CloudRunnerLogger.log(`PVC ${name} still ${phase} (check ${checkCount})`);
// Fetch and log PVC events for diagnostics
try {
const events = await kubeClient.listNamespacedEvent(namespace);
@@ -113,10 +117,10 @@ class KubernetesStorage {
count: x.count || 0,
}))
.slice(-5); // Get last 5 events
if (pvcEvents.length > 0) {
CloudRunnerLogger.log(`PVC Events: ${JSON.stringify(pvcEvents, undefined, 2)}`);
// Check if event indicates WaitForFirstConsumer
const waitForConsumerEvent = pvcEvents.find(
(e) => e.reason === 'WaitForFirstConsumer' || e.message?.includes('waiting for first consumer'),
@@ -132,7 +136,7 @@ class KubernetesStorage {
// Ignore event fetch errors
}
}
return phase !== 'Pending';
},
{
@@ -140,10 +144,10 @@ class KubernetesStorage {
intervalBetweenAttempts: 15000,
},
);
const finalPhase = await this.getPVCPhase(kubeClient, name, namespace);
CloudRunnerLogger.log(`PVC phase after wait: ${finalPhase}`);
if (finalPhase === 'Pending') {
throw new Error(`PVC ${name} is still Pending after timeout`);
}
@@ -152,7 +156,7 @@ class KubernetesStorage {
core.error(error.toString());
try {
const pvcBody = (await kubeClient.readNamespacedPersistentVolumeClaim(name, namespace)).body;
// Fetch PVC events for detailed diagnostics
let pvcEvents: any[] = [];
try {
@@ -168,7 +172,7 @@ class KubernetesStorage {
} catch (eventError) {
// Ignore event fetch errors
}
// Check if storage class exists
let storageClassInfo = '';
try {
@@ -178,10 +182,12 @@ class KubernetesStorage {
const kubeConfig = new k8s.KubeConfig();
kubeConfig.loadFromDefault();
const storageV1Api = kubeConfig.makeApiClient(k8s.StorageV1Api);
try {
const sc = await storageV1Api.readStorageClass(storageClassName);
storageClassInfo = `StorageClass "${storageClassName}" exists. Provisioner: ${sc.body.provisioner || 'unknown'}`;
storageClassInfo = `StorageClass "${storageClassName}" exists. Provisioner: ${
sc.body.provisioner || 'unknown'
}`;
} catch (scError: any) {
if (scError.statusCode === 404) {
storageClassInfo = `StorageClass "${storageClassName}" does NOT exist! This is likely why the PVC is stuck in Pending.`;
@@ -194,7 +200,7 @@ class KubernetesStorage {
// Ignore storage class check errors - not critical for diagnostics
storageClassInfo = `Could not check storage class: ${scCheckError}`;
}
core.error(
`PVC Body: ${JSON.stringify(
{
@@ -208,11 +214,11 @@ class KubernetesStorage {
4,
)}`,
);
if (storageClassInfo) {
core.error(storageClassInfo);
}
if (pvcEvents.length > 0) {
core.error(`PVC Events: ${JSON.stringify(pvcEvents, undefined, 2)}`);
} else {

View File

@@ -578,10 +578,13 @@ class KubernetesTaskRunner {
// Check pod conditions for scheduling issues
if (podStatusDetails?.conditions) {
const allConditions = podStatusDetails.conditions.map(
(c: any) => `${c.type}: ${c.status}${c.reason ? ` (${c.reason})` : ''}${c.message ? ` - ${c.message}` : ''}`,
(c: any) =>
`${c.type}: ${c.status}${c.reason ? ` (${c.reason})` : ''}${
c.message ? ` - ${c.message}` : ''
}`,
);
message += `\n\nPod Conditions:\n${allConditions.join('\n')}`;
const unschedulable = podStatusDetails.conditions.find(
(c: any) => c.type === 'PodScheduled' && c.status === 'False',
);
@@ -590,7 +593,7 @@ class KubernetesTaskRunner {
unschedulable.message || 'No message'
}`;
}
// Check if pod is assigned to a node
if (podStatusDetails?.hostIP) {
message += `\n\nPod assigned to node: ${podStatusDetails.hostIP}`;
@@ -598,23 +601,29 @@ class KubernetesTaskRunner {
message += `\n\nPod not yet assigned to a node (scheduling pending)`;
}
}
// Check node resources if pod is assigned
if (podStatusDetails?.hostIP) {
try {
const nodes = await kubeClient.listNode();
const hostIP = podStatusDetails.hostIP;
const assignedNode = nodes.body.items.find((n: any) =>
n.status?.addresses?.some((a: any) => a.address === hostIP)
const assignedNode = nodes.body.items.find((n: any) =>
n.status?.addresses?.some((a: any) => a.address === hostIP),
);
if (assignedNode?.status && assignedNode.metadata?.name) {
const allocatable = assignedNode.status.allocatable || {};
const capacity = assignedNode.status.capacity || {};
message += `\n\nNode Resources (${assignedNode.metadata.name}):\n Allocatable CPU: ${allocatable.cpu || 'unknown'}\n Allocatable Memory: ${allocatable.memory || 'unknown'}\n Allocatable Ephemeral Storage: ${allocatable['ephemeral-storage'] || 'unknown'}`;
message += `\n\nNode Resources (${assignedNode.metadata.name}):\n Allocatable CPU: ${
allocatable.cpu || 'unknown'
}\n Allocatable Memory: ${allocatable.memory || 'unknown'}\n Allocatable Ephemeral Storage: ${
allocatable['ephemeral-storage'] || 'unknown'
}`;
// Check for taints that might prevent scheduling
if (assignedNode.spec?.taints && assignedNode.spec.taints.length > 0) {
const taints = assignedNode.spec.taints.map((t: any) => `${t.key}=${t.value}:${t.effect}`).join(', ');
const taints = assignedNode.spec.taints
.map((t: any) => `${t.key}=${t.value}:${t.effect}`)
.join(', ');
message += `\n Node Taints: ${taints}`;
}
}

View File

@@ -42,19 +42,92 @@ describe('Cloud Runner pre-built S3 steps', () => {
// Only run the test if we have AWS creds in CI, or the AWS CLI is available locally
if (shouldRunS3) {
it('Run build and prebuilt s3 cache pull, cache push and upload build', async () => {
const cacheKey = `test-case-${uuidv4()}`;
const buildGuid = `test-build-${uuidv4()}`;
// Use customJob to run only S3 hooks without a full Unity build
// This is a quick validation test for S3 operations, not a full build test
const overrides = {
versioning: 'None',
projectPath: 'test-project',
unityVersion: UnityVersioning.determineUnityVersion('test-project', UnityVersioning.read('test-project')),
targetPlatform: 'StandaloneLinux64',
cacheKey: `test-case-${uuidv4()}`,
containerHookFiles: `aws-s3-pull-cache,aws-s3-upload-cache,aws-s3-upload-build`,
cacheKey: cacheKey,
buildGuid: buildGuid,
cloudRunnerDebug: true,
// Use customJob to run a minimal job that sets up test data and then runs S3 hooks
customJob: `
- name: setup-test-data
image: ubuntu
commands: |
# Create test cache directories and files to simulate what S3 hooks would work with
mkdir -p /data/cache/${cacheKey}/Library/test-package
mkdir -p /data/cache/${cacheKey}/lfs/test-asset
mkdir -p /data/cache/${cacheKey}/build
echo "test-library-content" > /data/cache/${cacheKey}/Library/test-package/test.txt
echo "test-lfs-content" > /data/cache/${cacheKey}/lfs/test-asset/test.txt
echo "test-build-content" > /data/cache/${cacheKey}/build/build-${buildGuid}.tar
echo "Test data created successfully"
- name: test-s3-pull-cache
image: amazon/aws-cli
commands: |
# Test aws-s3-pull-cache hook logic (simplified)
if command -v aws > /dev/null 2>&1; then
if [ -n "$AWS_ACCESS_KEY_ID" ]; then
aws configure set aws_access_key_id "$AWS_ACCESS_KEY_ID" --profile default || true
fi
if [ -n "$AWS_SECRET_ACCESS_KEY" ]; then
aws configure set aws_secret_access_key "$AWS_SECRET_ACCESS_KEY" --profile default || true
fi
if [ -n "$AWS_DEFAULT_REGION" ]; then
aws configure set region "$AWS_DEFAULT_REGION" --profile default || true
fi
ENDPOINT_ARGS=""
if [ -n "$AWS_S3_ENDPOINT" ]; then ENDPOINT_ARGS="--endpoint-url $AWS_S3_ENDPOINT"; fi
echo "S3 pull cache hook test completed"
else
echo "AWS CLI not available, skipping aws-s3-pull-cache test"
fi
- name: test-s3-upload-cache
image: amazon/aws-cli
commands: |
# Test aws-s3-upload-cache hook logic (simplified)
if command -v aws > /dev/null 2>&1; then
if [ -n "$AWS_ACCESS_KEY_ID" ]; then
aws configure set aws_access_key_id "$AWS_ACCESS_KEY_ID" --profile default || true
fi
if [ -n "$AWS_SECRET_ACCESS_KEY" ]; then
aws configure set aws_secret_access_key "$AWS_SECRET_ACCESS_KEY" --profile default || true
fi
ENDPOINT_ARGS=""
if [ -n "$AWS_S3_ENDPOINT" ]; then ENDPOINT_ARGS="--endpoint-url $AWS_S3_ENDPOINT"; fi
echo "S3 upload cache hook test completed"
else
echo "AWS CLI not available, skipping aws-s3-upload-cache test"
fi
- name: test-s3-upload-build
image: amazon/aws-cli
commands: |
# Test aws-s3-upload-build hook logic (simplified)
if command -v aws > /dev/null 2>&1; then
if [ -n "$AWS_ACCESS_KEY_ID" ]; then
aws configure set aws_access_key_id "$AWS_ACCESS_KEY_ID" --profile default || true
fi
if [ -n "$AWS_SECRET_ACCESS_KEY" ]; then
aws configure set aws_secret_access_key "$AWS_SECRET_ACCESS_KEY" --profile default || true
fi
ENDPOINT_ARGS=""
if [ -n "$AWS_S3_ENDPOINT" ]; then ENDPOINT_ARGS="--endpoint-url $AWS_S3_ENDPOINT"; fi
echo "S3 upload build hook test completed"
else
echo "AWS CLI not available, skipping aws-s3-upload-build test"
fi
`,
};
const buildParameter2 = await CreateParameters(overrides);
const baseImage2 = new ImageTag(buildParameter2);
const results2Object = await CloudRunner.run(buildParameter2, baseImage2.toString());
CloudRunnerLogger.log(`run 2 succeeded`);
CloudRunnerLogger.log(`S3 hooks test succeeded`);
expect(results2Object.BuildSucceeded).toBe(true);
// Only run S3 operations if environment supports it