fix: harden Windows Parallels smoke install and onboarding

2026-04-05 15:56:52 +01:00 · 2026-04-05 15:56:52 +01:00 · 5eb551ccfa
parent b723b30def
commit 5eb551ccfa
6 changed files with 311 additions and 27 deletions
--- a/.agents/skills/openclaw-parallels-smoke/SKILL.md
+++ b/.agents/skills/openclaw-parallels-smoke/SKILL.md
@ -68,7 +68,9 @@ Use this skill for Parallels guest workflows and smoke interpretation. Do not lo
 - Windows installer/tgz phases now retry once after guest-ready recheck; keep new Windows smoke steps idempotent so a transport-flake retry is safe.
 - If a Windows retry sees the VM become `suspended` or `stopped`, resume/start it before the next `prlctl exec`; otherwise the second attempt just repeats the same `rc=255`.
 - Windows global `npm install -g` phases can stay quiet for a minute or more even when healthy; inspect the phase log before calling it hung, and only treat it as a regression once the retry wrapper or timeout trips.
+- Fresh Windows tgz install phases should also use the background PowerShell runner plus done-file/log-drain pattern; do not rely on one long-lived `prlctl exec ... powershell ... npm install -g` transport for package installs.
 - Fresh Windows ref-mode onboard should use the same background PowerShell runner plus done-file/log-drain pattern as the npm-update helper, including startup materialization checks, host-side timeouts on short poll `prlctl exec` calls, and retry-on-poll-failure behavior for transient transport flakes.
+- Fresh Windows daemon-health reachability should use a hello-only gateway probe (`probeGateway` with `detailLevel: "none"`) and a longer per-probe timeout than the default local attach path; full `health` RPCs ask too much of a gateway that is still starting up on current main.
 - Fresh Windows ref-mode agent verification should set `OPENAI_API_KEY` in the PowerShell environment before invoking `openclaw.cmd agent`, for the same pairing-required fallback reason as macOS.
 - The standalone Windows upgrade smoke lane should stop the managed gateway after `upgrade.install-main` and before `upgrade.onboard-ref`. Restarting before onboard can leave the old process alive on the pre-onboard token while onboard rewrites `~/.openclaw/openclaw.json`, which then fails `gateway-health` with `unauthorized: gateway token mismatch`.
 - If standalone Windows upgrade fails with a gateway token mismatch but `pnpm test:parallels:npm-update` passes, trust the mismatch as a standalone ref-onboard ordering bug first; the npm-update helper does not re-run ref-mode onboard on the same guest.
--- a/scripts/e2e/parallels-windows-smoke.sh
+++ b/scripts/e2e/parallels-windows-smoke.sh
@ -28,6 +28,7 @@ MAIN_TGZ_DIR="$(mktemp -d)"
 MAIN_TGZ_PATH=""
 MINGIT_ZIP_PATH=""
 MINGIT_ZIP_NAME=""
+WINDOWS_INSTALL_SCRIPT_PATH=""
 WINDOWS_ONBOARD_SCRIPT_PATH=""
 SERVER_PID=""
 RUN_DIR="$(mktemp -d /tmp/openclaw-parallels-windows.XXXXXX)"
@ -749,17 +750,31 @@ ensure_current_build() {

 ensure_guest_git() {
  local host_ip="$1"
-  local mingit_url
+  local mingit_url mingit_url_q mingit_name_q
  mingit_url="http://$host_ip:$HOST_PORT/$MINGIT_ZIP_NAME"
  if guest_exec cmd.exe /d /s /c "where git.exe >nul 2>nul && git.exe --version"; then
    return
  fi
-  guest_exec cmd.exe /d /s /c "if exist \"%LOCALAPPDATA%\\OpenClaw\\deps\\portable-git\" rmdir /s /q \"%LOCALAPPDATA%\\OpenClaw\\deps\\portable-git\""
-  guest_exec cmd.exe /d /s /c "if not exist \"%LOCALAPPDATA%\\OpenClaw\\deps\" mkdir \"%LOCALAPPDATA%\\OpenClaw\\deps\""
-  guest_exec cmd.exe /d /s /c "mkdir \"%LOCALAPPDATA%\\OpenClaw\\deps\\portable-git\""
-  guest_exec cmd.exe /d /s /c "curl.exe -fsSL \"$mingit_url\" -o \"%TEMP%\\$MINGIT_ZIP_NAME\""
-  guest_exec cmd.exe /d /s /c "tar.exe -xf \"%TEMP%\\$MINGIT_ZIP_NAME\" -C \"%LOCALAPPDATA%\\OpenClaw\\deps\\portable-git\""
-  guest_exec cmd.exe /d /s /c "del /q \"%TEMP%\\$MINGIT_ZIP_NAME\" & set \"PATH=%LOCALAPPDATA%\\OpenClaw\\deps\\portable-git\\cmd;%LOCALAPPDATA%\\OpenClaw\\deps\\portable-git\\mingw64\\bin;%LOCALAPPDATA%\\OpenClaw\\deps\\portable-git\\usr\\bin;%PATH%\" && git.exe --version"
+  mingit_url_q="$(ps_single_quote "$mingit_url")"
+  mingit_name_q="$(ps_single_quote "$MINGIT_ZIP_NAME")"
+  guest_powershell "$(cat <<EOF
+\$depsRoot = Join-Path \$env:LOCALAPPDATA 'OpenClaw\deps'
+\$portableGit = Join-Path \$depsRoot 'portable-git'
+\$archive = Join-Path \$env:TEMP '${mingit_name_q}'
+if (Test-Path \$portableGit) {
+  Remove-Item \$portableGit -Recurse -Force
+}
+New-Item -ItemType Directory -Force -Path \$portableGit | Out-Null
+if (-not (Test-Path \$portableGit)) {
+  throw 'portable git directory missing after create'
+}
+curl.exe -fsSL '${mingit_url_q}' -o \$archive
+tar.exe -xf \$archive -C \$portableGit
+Remove-Item \$archive -Force -ErrorAction SilentlyContinue
+\$env:PATH = "\$portableGit\cmd;\$portableGit\mingw64\bin;\$portableGit\usr\bin;\$env:PATH"
+git.exe --version
+EOF
+)"
 }

 pack_main_tgz() {
@ -869,13 +884,200 @@ EOF
 install_main_tgz() {
  local host_ip="$1"
  local temp_name="$2"
-  local tgz_url
+  local tgz_url script_url
+  local runner_name log_name done_name done_status launcher_state guest_log
+  local start_seconds poll_deadline startup_checked poll_rc state_rc log_rc
+  local log_state_path
  tgz_url="http://$host_ip:$HOST_PORT/$(basename "$MAIN_TGZ_PATH")"
-  # Global npm installs on the Windows guest can stay silent for long stretches.
-  # Treat the phase log plus retry wrapper as the primary signal before assuming
-  # the guest hung.
-  run_windows_retry "main tgz install" 2 \
-    guest_exec cmd.exe /d /s /c "set \"PATH=%LOCALAPPDATA%\\OpenClaw\\deps\\portable-git\\cmd;%LOCALAPPDATA%\\OpenClaw\\deps\\portable-git\\mingw64\\bin;%LOCALAPPDATA%\\OpenClaw\\deps\\portable-git\\usr\\bin;%PATH%\" && curl.exe -fsSL \"$tgz_url\" -o \"%TEMP%\\$temp_name\" && npm.cmd install -g \"%TEMP%\\$temp_name\" --no-fund --no-audit && \"%APPDATA%\\npm\\openclaw.cmd\" --version"
+  write_install_runner_script
+  script_url="http://$host_ip:$HOST_PORT/$(basename "$WINDOWS_INSTALL_SCRIPT_PATH")"
+  runner_name="openclaw-install-$RANDOM-$RANDOM.ps1"
+  log_name="openclaw-install-$RANDOM-$RANDOM.log"
+  done_name="openclaw-install-$RANDOM-$RANDOM.done"
+  log_state_path="$(mktemp "${TMPDIR:-/tmp}/openclaw-install-log-state.XXXXXX")"
+  : >"$log_state_path"
+  start_seconds="$SECONDS"
+  poll_deadline=$((SECONDS + TIMEOUT_INSTALL_S + 60))
+  startup_checked=0
+
+  guest_powershell "$(cat <<EOF
+\$runner = Join-Path \$env:TEMP '$runner_name'
+\$log = Join-Path \$env:TEMP '$log_name'
+\$done = Join-Path \$env:TEMP '$done_name'
+Remove-Item \$runner, \$log, \$done -Force -ErrorAction SilentlyContinue
+curl.exe -fsSL '$script_url' -o \$runner
+Start-Process powershell.exe -ArgumentList @(
+  '-NoProfile',
+  '-ExecutionPolicy', 'Bypass',
+  '-File', \$runner,
+  '-TgzUrl', '$tgz_url',
+  '-TempName', '$temp_name',
+  '-LogPath', \$log,
+  '-DonePath', \$done
+) -WindowStyle Hidden | Out-Null
+EOF
+)"
+
+  stream_windows_install_log() {
+    set +e
+    guest_log="$(
+      guest_powershell_poll 20 "\$log = Join-Path \$env:TEMP '$log_name'; if (Test-Path \$log) { Get-Content \$log }"
+    )"
+    log_rc=$?
+    set -e
+    if [[ $log_rc -ne 0 ]] || [[ -z "$guest_log" ]]; then
+      return "$log_rc"
+    fi
+    GUEST_LOG="$guest_log" python3 - "$log_state_path" <<'PY'
+import os
+import pathlib
+import sys
+
+state_path = pathlib.Path(sys.argv[1])
+previous = state_path.read_text(encoding="utf-8", errors="replace")
+current = os.environ["GUEST_LOG"].replace("\r\n", "\n").replace("\r", "\n")
+
+if current.startswith(previous):
+    sys.stdout.write(current[len(previous):])
+else:
+    sys.stdout.write(current)
+
+state_path.write_text(current, encoding="utf-8")
+PY
+  }
+
+  while :; do
+    set +e
+    done_status="$(
+      guest_powershell_poll 20 "\$done = Join-Path \$env:TEMP '$done_name'; if (Test-Path \$done) { (Get-Content \$done -Raw).Trim() }"
+    )"
+    poll_rc=$?
+    set -e
+    done_status="${done_status//$'\r'/}"
+    if [[ $poll_rc -ne 0 ]]; then
+      warn "windows install helper poll failed; retrying"
+      if (( SECONDS >= poll_deadline )); then
+        warn "windows install helper timed out while polling done file"
+        rm -f "$log_state_path"
+        return 1
+      fi
+      sleep 2
+      continue
+    fi
+    set +e
+    stream_windows_install_log
+    log_rc=$?
+    set -e
+    if [[ $log_rc -ne 0 ]]; then
+      warn "windows install helper live log poll failed; retrying"
+    fi
+    if [[ -n "$done_status" ]]; then
+      if ! stream_windows_install_log; then
+        warn "windows install helper log drain failed after completion"
+      fi
+      rm -f "$log_state_path"
+      [[ "$done_status" == "0" ]]
+      return $?
+    fi
+    if [[ "$startup_checked" -eq 0 && $((SECONDS - start_seconds)) -ge 20 ]]; then
+      set +e
+      launcher_state="$(
+        guest_powershell_poll 20 "\$runner = Join-Path \$env:TEMP '$runner_name'; \$log = Join-Path \$env:TEMP '$log_name'; \$done = Join-Path \$env:TEMP '$done_name'; 'runner=' + (Test-Path \$runner) + ' log=' + (Test-Path \$log) + ' done=' + (Test-Path \$done)"
+      )"
+      state_rc=$?
+      set -e
+      launcher_state="${launcher_state//$'\r'/}"
+      startup_checked=1
+      if [[ $state_rc -eq 0 && "$launcher_state" == *"runner=False"* && "$launcher_state" == *"log=False"* && "$launcher_state" == *"done=False"* ]]; then
+        warn "windows install helper failed to materialize guest files"
+        rm -f "$log_state_path"
+        return 1
+      fi
+    fi
+    if (( SECONDS >= poll_deadline )); then
+      if ! stream_windows_install_log; then
+        warn "windows install helper log drain failed after timeout"
+      fi
+      warn "windows install helper timed out waiting for done file"
+      rm -f "$log_state_path"
+      return 1
+    fi
+    sleep 2
+  done
+}
+
+write_install_runner_script() {
+  WINDOWS_INSTALL_SCRIPT_PATH="$MAIN_TGZ_DIR/openclaw-install-main.ps1"
+  cat >"$WINDOWS_INSTALL_SCRIPT_PATH" <<'EOF'
+param(
+  [Parameter(Mandatory = $true)][string]$TgzUrl,
+  [Parameter(Mandatory = $true)][string]$TempName,
+  [Parameter(Mandatory = $true)][string]$LogPath,
+  [Parameter(Mandatory = $true)][string]$DonePath
+)
+
+$ErrorActionPreference = 'Stop'
+$PSNativeCommandUseErrorActionPreference = $false
+
+function Write-ProgressLog {
+  param([Parameter(Mandatory = $true)][string]$Stage)
+
+  "==> $Stage" | Tee-Object -FilePath $LogPath -Append | Out-Null
+}
+
+function Invoke-Logged {
+  param(
+    [Parameter(Mandatory = $true)][string]$Label,
+    [Parameter(Mandatory = $true)][scriptblock]$Command
+  )
+
+  $output = $null
+  $previousErrorActionPreference = $ErrorActionPreference
+  $previousNativeErrorPreference = $PSNativeCommandUseErrorActionPreference
+  try {
+    $ErrorActionPreference = 'Continue'
+    $PSNativeCommandUseErrorActionPreference = $false
+    $output = & $Command *>&1
+    $exitCode = $LASTEXITCODE
+  } finally {
+    $ErrorActionPreference = $previousErrorActionPreference
+    $PSNativeCommandUseErrorActionPreference = $previousNativeErrorPreference
+  }
+
+  if ($null -ne $output) {
+    $output | Tee-Object -FilePath $LogPath -Append | Out-Null
+  }
+
+  if ($exitCode -ne 0) {
+    throw "$Label failed with exit code $exitCode"
+  }
+}
+
+try {
+  $env:PATH = "$env:LOCALAPPDATA\OpenClaw\deps\portable-git\cmd;$env:LOCALAPPDATA\OpenClaw\deps\portable-git\mingw64\bin;$env:LOCALAPPDATA\OpenClaw\deps\portable-git\usr\bin;$env:PATH"
+  $tgz = Join-Path $env:TEMP $TempName
+  Remove-Item $tgz, $LogPath, $DonePath -Force -ErrorAction SilentlyContinue
+  Write-ProgressLog 'install.start'
+  Write-ProgressLog 'install.download-tgz'
+  Invoke-Logged 'download current tgz' { curl.exe -fsSL $TgzUrl -o $tgz }
+  Write-ProgressLog 'install.install-tgz'
+  Invoke-Logged 'npm install current tgz' { npm.cmd install -g $tgz --no-fund --no-audit }
+  $openclaw = Join-Path $env:APPDATA 'npm\openclaw.cmd'
+  Write-ProgressLog 'install.verify-version'
+  Invoke-Logged 'openclaw --version' { & $openclaw --version }
+  Write-ProgressLog 'install.done'
+  Set-Content -Path $DonePath -Value ([string]0)
+  exit 0
+} catch {
+  if (Test-Path $LogPath) {
+    Add-Content -Path $LogPath -Value ($_ | Out-String)
+  } else {
+    ($_ | Out-String) | Set-Content -Path $LogPath
+  }
+  Set-Content -Path $DonePath -Value '1'
+  exit 1
+}
+EOF
 }

 verify_version_contains() {
--- a/src/commands/onboard-helpers.test.ts
+++ b/src/commands/onboard-helpers.test.ts
@ -3,6 +3,7 @@ import { afterEach, describe, expect, it, vi } from "vitest";
 import {
  normalizeGatewayTokenInput,
  openUrl,
+  probeGatewayReachable,
  resolveBrowserOpenCommand,
  resolveControlUiLinks,
  validateGatewayPasswordInput,
@ -22,6 +23,7 @@ const mocks = vi.hoisted(() => ({
    killed: false,
  })),
  pickPrimaryTailnetIPv4: vi.fn<() => string | undefined>(() => undefined),
+  probeGateway: vi.fn(),
 }));

 vi.mock("../process/exec.js", () => ({
@ -32,6 +34,10 @@ vi.mock("../infra/tailnet.js", () => ({
  pickPrimaryTailnetIPv4: mocks.pickPrimaryTailnetIPv4,
 }));

+vi.mock("../gateway/probe.js", () => ({
+  probeGateway: mocks.probeGateway,
+}));
+
 afterEach(() => {
  vi.restoreAllMocks();
  vi.unstubAllEnvs();
@ -74,6 +80,62 @@ describe("resolveBrowserOpenCommand", () => {
  });
 });

+describe("probeGatewayReachable", () => {
+  it("uses a hello-only probe for onboarding reachability", async () => {
+    mocks.probeGateway.mockResolvedValueOnce({
+      ok: true,
+      url: "ws://127.0.0.1:18789",
+      connectLatencyMs: 42,
+      error: null,
+      close: null,
+      health: null,
+      status: null,
+      presence: null,
+      configSnapshot: null,
+    });
+
+    const result = await probeGatewayReachable({
+      url: "ws://127.0.0.1:18789",
+      token: "tok_test",
+      timeoutMs: 2500,
+    });
+
+    expect(result).toEqual({ ok: true });
+    expect(mocks.probeGateway).toHaveBeenCalledWith({
+      url: "ws://127.0.0.1:18789",
+      timeoutMs: 2500,
+      auth: {
+        token: "tok_test",
+        password: undefined,
+      },
+      detailLevel: "none",
+    });
+  });
+
+  it("returns the probe error detail on failure", async () => {
+    mocks.probeGateway.mockResolvedValueOnce({
+      ok: false,
+      url: "ws://127.0.0.1:18789",
+      connectLatencyMs: null,
+      error: "connect failed: timeout",
+      close: null,
+      health: null,
+      status: null,
+      presence: null,
+      configSnapshot: null,
+    });
+
+    const result = await probeGatewayReachable({
+      url: "ws://127.0.0.1:18789",
+    });
+
+    expect(result).toEqual({
+      ok: false,
+      detail: "connect failed: timeout",
+    });
+  });
+});
+
 describe("resolveControlUiLinks", () => {
  it("uses customBindHost for custom bind", () => {
    const links = resolveControlUiLinks({
--- a/src/commands/onboard-helpers.ts
+++ b/src/commands/onboard-helpers.ts
@ -8,9 +8,9 @@ import type { OpenClawConfig } from "../config/config.js";
 import { CONFIG_PATH } from "../config/config.js";
 import { resolveAgentModelPrimaryValue } from "../config/model-input.js";
 import { resolveSessionTranscriptsDirForAgent } from "../config/sessions.js";
-import { callGateway } from "../gateway/call.js";
 import { resolveControlUiLinks } from "../gateway/control-ui-links.js";
 import { normalizeControlUiBasePath } from "../gateway/control-ui-shared.js";
+import { probeGateway } from "../gateway/probe.js";
 import {
  detectBrowserOpenSupport,
  openUrl,
@ -22,7 +22,6 @@ import { runCommandWithTimeout } from "../process/exec.js";
 import type { RuntimeEnv } from "../runtime.js";
 import { stylePromptTitle } from "../terminal/prompt-style.js";
 import { CONFIG_DIR, shortenHomeInString, shortenHomePath, sleep } from "../utils.js";
-import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../utils/message-channel.js";
 import { VERSION } from "../version.js";
 import type { NodeManagerChoice, OnboardMode, ResetScope } from "./onboard-types.js";

@ -228,16 +227,16 @@ export async function probeGatewayReachable(params: {
  const url = params.url.trim();
  const timeoutMs = params.timeoutMs ?? 1500;
  try {
-    await callGateway({
+    const probe = await probeGateway({
      url,
-      token: params.token,
-      password: params.password,
-      method: "health",
      timeoutMs,
-      clientName: GATEWAY_CLIENT_NAMES.PROBE,
-      mode: GATEWAY_CLIENT_MODES.PROBE,
+      auth: {
+        token: params.token,
+        password: params.password,
+      },
+      detailLevel: "none",
    });
-    return { ok: true };
+    return probe.ok ? { ok: true } : { ok: false, detail: probe.error ?? undefined };
  } catch (err) {
    return { ok: false, detail: summarizeError(err) };
  }
--- a/src/commands/onboard-non-interactive.gateway.test.ts
+++ b/src/commands/onboard-non-interactive.gateway.test.ts
@ -33,7 +33,13 @@ const readLastGatewayErrorLineMock = vi.hoisted(() =>
  vi.fn(async () => "Gateway failed to start: required secrets are unavailable."),
 );
 let waitForGatewayReachableMock:
-  | ((params: { url: string; token?: string; password?: string; deadlineMs?: number }) => Promise<{
+  | ((params: {
+      url: string;
+      token?: string;
+      password?: string;
+      deadlineMs?: number;
+      probeTimeoutMs?: number;
+    }) => Promise<{
      ok: boolean;
      detail?: string;
    }>)
@ -479,10 +485,20 @@ describe("onboard (non-interactive): gateway and remote auth", () => {
  it("uses a longer health deadline when daemon install was requested", async () => {
    await withStateDir("state-local-daemon-health-", async (stateDir) => {
      let capturedDeadlineMs: number | undefined;
-      waitForGatewayReachableMock = vi.fn(async (params: { deadlineMs?: number }) => {
-        capturedDeadlineMs = params.deadlineMs;
-        return { ok: true };
-      });
+      let capturedProbeTimeoutMs: number | undefined;
+      waitForGatewayReachableMock = vi.fn(
+        async (params: {
+          url: string;
+          token?: string;
+          password?: string;
+          deadlineMs?: number;
+          probeTimeoutMs?: number;
+        }) => {
+          capturedDeadlineMs = params.deadlineMs;
+          capturedProbeTimeoutMs = params.probeTimeoutMs;
+          return { ok: true };
+        },
+      );

      await runNonInteractiveSetup(
        {
@ -500,6 +516,7 @@ describe("onboard (non-interactive): gateway and remote auth", () => {

      expect(installGatewayDaemonNonInteractiveMock).toHaveBeenCalledTimes(1);
      expect(capturedDeadlineMs).toBe(45_000);
+      expect(capturedProbeTimeoutMs).toBe(10_000);
    });
  }, 60_000);

--- a/src/commands/onboard-non-interactive/local.ts
+++ b/src/commands/onboard-non-interactive/local.ts
@ -25,6 +25,7 @@ import { resolveNonInteractiveWorkspaceDir } from "./local/workspace.js";

 const INSTALL_DAEMON_HEALTH_DEADLINE_MS = 45_000;
 const ATTACH_EXISTING_GATEWAY_HEALTH_DEADLINE_MS = 15_000;
+const INSTALL_DAEMON_HEALTH_PROBE_TIMEOUT_MS = 10_000;

 async function collectGatewayHealthFailureDiagnostics(): Promise<
  GatewayHealthFailureDiagnostics | undefined
@ -211,6 +212,7 @@ export async function runNonInteractiveLocalSetup(params: {
      deadlineMs: opts.installDaemon
        ? INSTALL_DAEMON_HEALTH_DEADLINE_MS
        : ATTACH_EXISTING_GATEWAY_HEALTH_DEADLINE_MS,
+      probeTimeoutMs: opts.installDaemon ? INSTALL_DAEMON_HEALTH_PROBE_TIMEOUT_MS : undefined,
    });
    if (!probe.ok) {
      const diagnostics = opts.installDaemon