fix: harden Windows Parallels smoke install and onboarding

This commit is contained in:
Peter Steinberger 2026-04-05 15:56:52 +01:00
parent b723b30def
commit 5eb551ccfa
No known key found for this signature in database
6 changed files with 311 additions and 27 deletions

View File

@ -68,7 +68,9 @@ Use this skill for Parallels guest workflows and smoke interpretation. Do not lo
- Windows installer/tgz phases now retry once after guest-ready recheck; keep new Windows smoke steps idempotent so a transport-flake retry is safe.
- If a Windows retry sees the VM become `suspended` or `stopped`, resume/start it before the next `prlctl exec`; otherwise the second attempt just repeats the same `rc=255`.
- Windows global `npm install -g` phases can stay quiet for a minute or more even when healthy; inspect the phase log before calling it hung, and only treat it as a regression once the retry wrapper or timeout trips.
- Fresh Windows tgz install phases should also use the background PowerShell runner plus done-file/log-drain pattern; do not rely on one long-lived `prlctl exec ... powershell ... npm install -g` transport for package installs.
- Fresh Windows ref-mode onboard should use the same background PowerShell runner plus done-file/log-drain pattern as the npm-update helper, including startup materialization checks, host-side timeouts on short poll `prlctl exec` calls, and retry-on-poll-failure behavior for transient transport flakes.
- Fresh Windows daemon-health reachability should use a hello-only gateway probe and a longer per-probe timeout than the default local attach path; full health RPCs are too eager during initial startup on current main.
- Fresh Windows ref-mode agent verification should set `OPENAI_API_KEY` in the PowerShell environment before invoking `openclaw.cmd agent`, for the same pairing-required fallback reason as macOS.
- The standalone Windows upgrade smoke lane should stop the managed gateway after `upgrade.install-main` and before `upgrade.onboard-ref`. Restarting before onboard can leave the old process alive on the pre-onboard token while onboard rewrites `~/.openclaw/openclaw.json`, which then fails `gateway-health` with `unauthorized: gateway token mismatch`.
- If standalone Windows upgrade fails with a gateway token mismatch but `pnpm test:parallels:npm-update` passes, treat the mismatch as a standalone ref-onboard ordering bug first; the npm-update helper does not re-run ref-mode onboard on the same guest.

View File

@ -28,6 +28,7 @@ MAIN_TGZ_DIR="$(mktemp -d)"
# Host path of the packed main tgz; install_main_tgz builds the guest download
# URL from its basename.
MAIN_TGZ_PATH=""
MINGIT_ZIP_PATH=""
# File name of the MinGit archive; ensure_guest_git uses it for both the
# download URL and the temporary archive name inside the guest.
MINGIT_ZIP_NAME=""
# Set by write_install_runner_script to the generated PowerShell install runner.
WINDOWS_INSTALL_SCRIPT_PATH=""
WINDOWS_ONBOARD_SCRIPT_PATH=""
SERVER_PID=""
# Per-run scratch directory created under /tmp.
RUN_DIR="$(mktemp -d /tmp/openclaw-parallels-windows.XXXXXX)"
@ -749,17 +750,31 @@ ensure_current_build() {
# Ensure git.exe is usable inside the Windows guest.
#
# Returns immediately when the guest already resolves git.exe. Otherwise a
# single PowerShell script recreates
# %LOCALAPPDATA%\OpenClaw\deps\portable-git, downloads the MinGit archive from
# http://<host_ip>:$HOST_PORT/$MINGIT_ZIP_NAME, unpacks it there, and finishes
# with `git.exe --version` on a PATH that includes the portable-git folders.
#
# Arguments:
#   $1  host IP address the guest uses to reach the host over HTTP.
ensure_guest_git() {
  local host_ip="$1"
  local mingit_url mingit_url_q mingit_name_q
  mingit_url="http://$host_ip:$HOST_PORT/$MINGIT_ZIP_NAME"
  if guest_exec cmd.exe /d /s /c "where git.exe >nul 2>nul && git.exe --version"; then
    return
  fi
  # Escape the values that are embedded in PowerShell single-quoted strings.
  mingit_url_q="$(ps_single_quote "$mingit_url")"
  mingit_name_q="$(ps_single_quote "$MINGIT_ZIP_NAME")"
  # One guest round-trip does the whole install. The script removes any
  # previous portable-git directory first, so re-running it after a transport
  # retry starts from a clean directory.
  guest_powershell "$(cat <<EOF
\$depsRoot = Join-Path \$env:LOCALAPPDATA 'OpenClaw\deps'
\$portableGit = Join-Path \$depsRoot 'portable-git'
\$archive = Join-Path \$env:TEMP '${mingit_name_q}'
if (Test-Path \$portableGit) {
  Remove-Item \$portableGit -Recurse -Force
}
New-Item -ItemType Directory -Force -Path \$portableGit | Out-Null
if (-not (Test-Path \$portableGit)) {
  throw 'portable git directory missing after create'
}
curl.exe -fsSL '${mingit_url_q}' -o \$archive
tar.exe -xf \$archive -C \$portableGit
Remove-Item \$archive -Force -ErrorAction SilentlyContinue
\$env:PATH = "\$portableGit\cmd;\$portableGit\mingw64\bin;\$portableGit\usr\bin;\$env:PATH"
git.exe --version
EOF
)"
}
pack_main_tgz() {
@ -869,13 +884,200 @@ EOF
# Install the packed main tgz globally inside the Windows guest.
#
# The install does not run over one long-lived guest exec. Instead this
# function:
#   1. writes the PowerShell install runner on the host
#      (write_install_runner_script) and has the guest download it,
#   2. launches the runner detached via Start-Process,
#   3. polls a done file with short guest_powershell_poll calls, streaming only
#      the not-yet-seen part of the runner log after each successful poll,
#   4. checks once, after 20 seconds, that the runner, log, or done file
#      exists in the guest.
#
# Arguments:
#   $1  host IP address the guest uses to reach the host over HTTP.
#   $2  file name (under the guest %TEMP%) for the downloaded tgz.
#
# Returns 0 when the done file contains "0". Returns 1 when the runner reports
# a non-zero status, the launcher fails, no guest file materializes, or
# TIMEOUT_INSTALL_S + 60 seconds pass without a done file.
install_main_tgz() {
  local host_ip="$1"
  local temp_name="$2"
  local tgz_url script_url
  local tgz_url_q script_url_q temp_name_q
  local runner_name log_name done_name done_status launcher_state guest_log
  local start_seconds poll_deadline startup_checked poll_rc state_rc log_rc
  local log_state_path
  tgz_url="http://$host_ip:$HOST_PORT/$(basename "$MAIN_TGZ_PATH")"
  # Global npm installs on the Windows guest can stay silent for long stretches.
  # Treat the phase log plus the poll deadline as the primary signal before
  # assuming the guest hung.
  write_install_runner_script
  script_url="http://$host_ip:$HOST_PORT/$(basename "$WINDOWS_INSTALL_SCRIPT_PATH")"
  # Escape the values that are embedded in PowerShell single-quoted strings,
  # the same way ensure_guest_git does.
  tgz_url_q="$(ps_single_quote "$tgz_url")"
  script_url_q="$(ps_single_quote "$script_url")"
  temp_name_q="$(ps_single_quote "$temp_name")"
  # Randomized guest file names keep a retried attempt from reading a previous
  # attempt's log or done file.
  runner_name="openclaw-install-$RANDOM-$RANDOM.ps1"
  log_name="openclaw-install-$RANDOM-$RANDOM.log"
  done_name="openclaw-install-$RANDOM-$RANDOM.done"
  # Host-side copy of the log text already printed, used to emit only new output.
  log_state_path="$(mktemp "${TMPDIR:-/tmp}/openclaw-install-log-state.XXXXXX")"
  : >"$log_state_path"
  start_seconds="$SECONDS"
  poll_deadline=$((SECONDS + TIMEOUT_INSTALL_S + 60))
  startup_checked=0
  if ! guest_powershell "$(cat <<EOF
\$runner = Join-Path \$env:TEMP '$runner_name'
\$log = Join-Path \$env:TEMP '$log_name'
\$done = Join-Path \$env:TEMP '$done_name'
Remove-Item \$runner, \$log, \$done -Force -ErrorAction SilentlyContinue
curl.exe -fsSL '${script_url_q}' -o \$runner
Start-Process powershell.exe -ArgumentList @(
  '-NoProfile',
  '-ExecutionPolicy', 'Bypass',
  '-File', \$runner,
  '-TgzUrl', '${tgz_url_q}',
  '-TempName', '${temp_name_q}',
  '-LogPath', \$log,
  '-DonePath', \$done
) -WindowStyle Hidden | Out-Null
EOF
)"; then
    # Do not leak the host-side state file when the launcher itself fails.
    warn "windows install helper failed to launch"
    rm -f "$log_state_path"
    return 1
  fi

  # Fetch the whole guest log and print only the part not printed before.
  # Returns non-zero when the guest poll (or the diff step) fails.
  stream_windows_install_log() {
    set +e
    guest_log="$(
      guest_powershell_poll 20 "\$log = Join-Path \$env:TEMP '$log_name'; if (Test-Path \$log) { Get-Content \$log }"
    )"
    log_rc=$?
    set -e
    if [[ $log_rc -ne 0 ]] || [[ -z "$guest_log" ]]; then
      return "$log_rc"
    fi
    GUEST_LOG="$guest_log" python3 - "$log_state_path" <<'PY'
import os
import pathlib
import sys

state_path = pathlib.Path(sys.argv[1])
previous = state_path.read_text(encoding="utf-8", errors="replace")
current = os.environ["GUEST_LOG"].replace("\r\n", "\n").replace("\r", "\n")
if current.startswith(previous):
    sys.stdout.write(current[len(previous):])
else:
    sys.stdout.write(current)
state_path.write_text(current, encoding="utf-8")
PY
  }

  while :; do
    set +e
    done_status="$(
      guest_powershell_poll 20 "\$done = Join-Path \$env:TEMP '$done_name'; if (Test-Path \$done) { (Get-Content \$done -Raw).Trim() }"
    )"
    poll_rc=$?
    set -e
    done_status="${done_status//$'\r'/}"
    if [[ $poll_rc -ne 0 ]]; then
      # A failed poll is retried until the overall deadline passes.
      warn "windows install helper poll failed; retrying"
      if (( SECONDS >= poll_deadline )); then
        warn "windows install helper timed out while polling done file"
        rm -f "$log_state_path"
        return 1
      fi
      sleep 2
      continue
    fi
    set +e
    stream_windows_install_log
    log_rc=$?
    set -e
    if [[ $log_rc -ne 0 ]]; then
      warn "windows install helper live log poll failed; retrying"
    fi
    if [[ -n "$done_status" ]]; then
      # Drain once more so output written after the live poll is not lost.
      if ! stream_windows_install_log; then
        warn "windows install helper log drain failed after completion"
      fi
      rm -f "$log_state_path"
      # Explicit branches: a bare failing [[ ]] could trip `set -e` before an
      # explicit return is reached.
      if [[ "$done_status" == "0" ]]; then
        return 0
      fi
      return 1
    fi
    if [[ "$startup_checked" -eq 0 && $((SECONDS - start_seconds)) -ge 20 ]]; then
      # One-time startup check: after 20 seconds at least one of the runner,
      # log, or done files must exist in the guest.
      set +e
      launcher_state="$(
        guest_powershell_poll 20 "\$runner = Join-Path \$env:TEMP '$runner_name'; \$log = Join-Path \$env:TEMP '$log_name'; \$done = Join-Path \$env:TEMP '$done_name'; 'runner=' + (Test-Path \$runner) + ' log=' + (Test-Path \$log) + ' done=' + (Test-Path \$done)"
      )"
      state_rc=$?
      set -e
      launcher_state="${launcher_state//$'\r'/}"
      startup_checked=1
      if [[ $state_rc -eq 0 && "$launcher_state" == *"runner=False"* && "$launcher_state" == *"log=False"* && "$launcher_state" == *"done=False"* ]]; then
        warn "windows install helper failed to materialize guest files"
        rm -f "$log_state_path"
        return 1
      fi
    fi
    if (( SECONDS >= poll_deadline )); then
      if ! stream_windows_install_log; then
        warn "windows install helper log drain failed after timeout"
      fi
      warn "windows install helper timed out waiting for done file"
      rm -f "$log_state_path"
      return 1
    fi
    sleep 2
  done
}
# Write the PowerShell install runner to $MAIN_TGZ_DIR/openclaw-install-main.ps1
# and record its path in WINDOWS_INSTALL_SCRIPT_PATH.
#
# The generated script takes -TgzUrl, -TempName, -LogPath and -DonePath. It
# prepends the portable-git folders to PATH, downloads the tgz to
# %TEMP%\<TempName> with curl.exe, runs `npm.cmd install -g` on it, and then
# runs `openclaw.cmd --version`. Stage markers and command output are appended
# to LogPath. DonePath receives 0 on success; on failure the error record is
# written to LogPath and DonePath receives 1.
write_install_runner_script() {
WINDOWS_INSTALL_SCRIPT_PATH="$MAIN_TGZ_DIR/openclaw-install-main.ps1"
# The heredoc delimiter is quoted, so bash writes the PowerShell text verbatim
# without expanding any `$` inside it.
cat >"$WINDOWS_INSTALL_SCRIPT_PATH" <<'EOF'
param(
[Parameter(Mandatory = $true)][string]$TgzUrl,
[Parameter(Mandatory = $true)][string]$TempName,
[Parameter(Mandatory = $true)][string]$LogPath,
[Parameter(Mandatory = $true)][string]$DonePath
)
$ErrorActionPreference = 'Stop'
$PSNativeCommandUseErrorActionPreference = $false
function Write-ProgressLog {
param([Parameter(Mandatory = $true)][string]$Stage)
"==> $Stage" | Tee-Object -FilePath $LogPath -Append | Out-Null
}
function Invoke-Logged {
param(
[Parameter(Mandatory = $true)][string]$Label,
[Parameter(Mandatory = $true)][scriptblock]$Command
)
$output = $null
$previousErrorActionPreference = $ErrorActionPreference
$previousNativeErrorPreference = $PSNativeCommandUseErrorActionPreference
try {
$ErrorActionPreference = 'Continue'
$PSNativeCommandUseErrorActionPreference = $false
$output = & $Command *>&1
$exitCode = $LASTEXITCODE
} finally {
$ErrorActionPreference = $previousErrorActionPreference
$PSNativeCommandUseErrorActionPreference = $previousNativeErrorPreference
}
if ($null -ne $output) {
$output | Tee-Object -FilePath $LogPath -Append | Out-Null
}
if ($exitCode -ne 0) {
throw "$Label failed with exit code $exitCode"
}
}
try {
$env:PATH = "$env:LOCALAPPDATA\OpenClaw\deps\portable-git\cmd;$env:LOCALAPPDATA\OpenClaw\deps\portable-git\mingw64\bin;$env:LOCALAPPDATA\OpenClaw\deps\portable-git\usr\bin;$env:PATH"
$tgz = Join-Path $env:TEMP $TempName
Remove-Item $tgz, $LogPath, $DonePath -Force -ErrorAction SilentlyContinue
Write-ProgressLog 'install.start'
Write-ProgressLog 'install.download-tgz'
Invoke-Logged 'download current tgz' { curl.exe -fsSL $TgzUrl -o $tgz }
Write-ProgressLog 'install.install-tgz'
Invoke-Logged 'npm install current tgz' { npm.cmd install -g $tgz --no-fund --no-audit }
$openclaw = Join-Path $env:APPDATA 'npm\openclaw.cmd'
Write-ProgressLog 'install.verify-version'
Invoke-Logged 'openclaw --version' { & $openclaw --version }
Write-ProgressLog 'install.done'
Set-Content -Path $DonePath -Value ([string]0)
exit 0
} catch {
if (Test-Path $LogPath) {
Add-Content -Path $LogPath -Value ($_ | Out-String)
} else {
($_ | Out-String) | Set-Content -Path $LogPath
}
Set-Content -Path $DonePath -Value '1'
exit 1
}
EOF
}
verify_version_contains() {

View File

@ -3,6 +3,7 @@ import { afterEach, describe, expect, it, vi } from "vitest";
import {
normalizeGatewayTokenInput,
openUrl,
probeGatewayReachable,
resolveBrowserOpenCommand,
resolveControlUiLinks,
validateGatewayPasswordInput,
@ -22,6 +23,7 @@ const mocks = vi.hoisted(() => ({
killed: false,
})),
pickPrimaryTailnetIPv4: vi.fn<() => string | undefined>(() => undefined),
probeGateway: vi.fn(),
}));
vi.mock("../process/exec.js", () => ({
@ -32,6 +34,10 @@ vi.mock("../infra/tailnet.js", () => ({
pickPrimaryTailnetIPv4: mocks.pickPrimaryTailnetIPv4,
}));
vi.mock("../gateway/probe.js", () => ({
probeGateway: mocks.probeGateway,
}));
afterEach(() => {
vi.restoreAllMocks();
vi.unstubAllEnvs();
@ -74,6 +80,62 @@ describe("resolveBrowserOpenCommand", () => {
});
});
describe("probeGatewayReachable", () => {
  const gatewayUrl = "ws://127.0.0.1:18789";

  /** Builds a mocked probe result whose detail fields are all null. */
  const buildProbeResult = (outcome: {
    ok: boolean;
    connectLatencyMs: number | null;
    error: string | null;
  }) => ({
    ok: outcome.ok,
    url: gatewayUrl,
    connectLatencyMs: outcome.connectLatencyMs,
    error: outcome.error,
    close: null,
    health: null,
    status: null,
    presence: null,
    configSnapshot: null,
  });

  it("uses a hello-only probe for onboarding reachability", async () => {
    mocks.probeGateway.mockResolvedValueOnce(
      buildProbeResult({ ok: true, connectLatencyMs: 42, error: null }),
    );

    const reachability = await probeGatewayReachable({
      url: gatewayUrl,
      token: "tok_test",
      timeoutMs: 2500,
    });

    // The probe must be asked for no detail beyond the connection itself.
    const expectedProbeArgs = {
      url: gatewayUrl,
      timeoutMs: 2500,
      auth: {
        token: "tok_test",
        password: undefined,
      },
      detailLevel: "none",
    };
    expect(reachability).toEqual({ ok: true });
    expect(mocks.probeGateway).toHaveBeenCalledWith(expectedProbeArgs);
  });

  it("returns the probe error detail on failure", async () => {
    const failureMessage = "connect failed: timeout";
    mocks.probeGateway.mockResolvedValueOnce(
      buildProbeResult({ ok: false, connectLatencyMs: null, error: failureMessage }),
    );

    const reachability = await probeGatewayReachable({ url: gatewayUrl });

    // The probe's error text is surfaced unchanged as `detail`.
    expect(reachability).toEqual({
      ok: false,
      detail: failureMessage,
    });
  });
});
describe("resolveControlUiLinks", () => {
it("uses customBindHost for custom bind", () => {
const links = resolveControlUiLinks({

View File

@ -8,9 +8,9 @@ import type { OpenClawConfig } from "../config/config.js";
import { CONFIG_PATH } from "../config/config.js";
import { resolveAgentModelPrimaryValue } from "../config/model-input.js";
import { resolveSessionTranscriptsDirForAgent } from "../config/sessions.js";
import { callGateway } from "../gateway/call.js";
import { resolveControlUiLinks } from "../gateway/control-ui-links.js";
import { normalizeControlUiBasePath } from "../gateway/control-ui-shared.js";
import { probeGateway } from "../gateway/probe.js";
import {
detectBrowserOpenSupport,
openUrl,
@ -22,7 +22,6 @@ import { runCommandWithTimeout } from "../process/exec.js";
import type { RuntimeEnv } from "../runtime.js";
import { stylePromptTitle } from "../terminal/prompt-style.js";
import { CONFIG_DIR, shortenHomeInString, shortenHomePath, sleep } from "../utils.js";
import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../utils/message-channel.js";
import { VERSION } from "../version.js";
import type { NodeManagerChoice, OnboardMode, ResetScope } from "./onboard-types.js";
@ -228,16 +227,16 @@ export async function probeGatewayReachable(params: {
const url = params.url.trim();
const timeoutMs = params.timeoutMs ?? 1500;
try {
await callGateway({
const probe = await probeGateway({
url,
token: params.token,
password: params.password,
method: "health",
timeoutMs,
clientName: GATEWAY_CLIENT_NAMES.PROBE,
mode: GATEWAY_CLIENT_MODES.PROBE,
auth: {
token: params.token,
password: params.password,
},
detailLevel: "none",
});
return { ok: true };
return probe.ok ? { ok: true } : { ok: false, detail: probe.error ?? undefined };
} catch (err) {
return { ok: false, detail: summarizeError(err) };
}

View File

@ -33,7 +33,13 @@ const readLastGatewayErrorLineMock = vi.hoisted(() =>
vi.fn(async () => "Gateway failed to start: required secrets are unavailable."),
);
let waitForGatewayReachableMock:
| ((params: { url: string; token?: string; password?: string; deadlineMs?: number }) => Promise<{
| ((params: {
url: string;
token?: string;
password?: string;
deadlineMs?: number;
probeTimeoutMs?: number;
}) => Promise<{
ok: boolean;
detail?: string;
}>)
@ -479,10 +485,20 @@ describe("onboard (non-interactive): gateway and remote auth", () => {
it("uses a longer health deadline when daemon install was requested", async () => {
await withStateDir("state-local-daemon-health-", async (stateDir) => {
let capturedDeadlineMs: number | undefined;
waitForGatewayReachableMock = vi.fn(async (params: { deadlineMs?: number }) => {
capturedDeadlineMs = params.deadlineMs;
return { ok: true };
});
let capturedProbeTimeoutMs: number | undefined;
waitForGatewayReachableMock = vi.fn(
async (params: {
url: string;
token?: string;
password?: string;
deadlineMs?: number;
probeTimeoutMs?: number;
}) => {
capturedDeadlineMs = params.deadlineMs;
capturedProbeTimeoutMs = params.probeTimeoutMs;
return { ok: true };
},
);
await runNonInteractiveSetup(
{
@ -500,6 +516,7 @@ describe("onboard (non-interactive): gateway and remote auth", () => {
expect(installGatewayDaemonNonInteractiveMock).toHaveBeenCalledTimes(1);
expect(capturedDeadlineMs).toBe(45_000);
expect(capturedProbeTimeoutMs).toBe(10_000);
});
}, 60_000);

View File

@ -25,6 +25,7 @@ import { resolveNonInteractiveWorkspaceDir } from "./local/workspace.js";
// Passed as `deadlineMs` to the gateway reachability wait when daemon install was requested.
const INSTALL_DAEMON_HEALTH_DEADLINE_MS = 45_000;
// Passed as `deadlineMs` when daemon install was not requested.
const ATTACH_EXISTING_GATEWAY_HEALTH_DEADLINE_MS = 15_000;
// Passed as `probeTimeoutMs` when daemon install was requested; otherwise `undefined` is passed.
const INSTALL_DAEMON_HEALTH_PROBE_TIMEOUT_MS = 10_000;
async function collectGatewayHealthFailureDiagnostics(): Promise<
GatewayHealthFailureDiagnostics | undefined
@ -211,6 +212,7 @@ export async function runNonInteractiveLocalSetup(params: {
deadlineMs: opts.installDaemon
? INSTALL_DAEMON_HEALTH_DEADLINE_MS
: ATTACH_EXISTING_GATEWAY_HEALTH_DEADLINE_MS,
probeTimeoutMs: opts.installDaemon ? INSTALL_DAEMON_HEALTH_PROBE_TIMEOUT_MS : undefined,
});
if (!probe.ok) {
const diagnostics = opts.installDaemon