fix: stop stale Windows gateway before upgrade onboard

This commit is contained in:
Peter Steinberger 2026-04-01 14:52:10 +01:00
parent 38410705bf
commit 9065243729
No known key found for this signature in database
2 changed files with 26 additions and 13 deletions

View File

@ -66,7 +66,8 @@ Use this skill for Parallels guest workflows and smoke interpretation. Do not lo
- Windows global `npm install -g` phases can stay quiet for a minute or more even when healthy; inspect the phase log before calling it hung, and only treat it as a regression once the retry wrapper or timeout trips.
- Fresh Windows ref-mode onboard should use the same background PowerShell runner plus done-file/log-drain pattern as the npm-update helper, including startup materialization checks, host-side timeouts on short poll `prlctl exec` calls, and retry-on-poll-failure behavior for transient transport flakes.
- Fresh Windows ref-mode agent verification should set `OPENAI_API_KEY` in the PowerShell environment before invoking `openclaw.cmd agent`, for the same pairing-required fallback reason as macOS.
- The Windows upgrade smoke lane should restart the managed gateway after `upgrade.install-main` and before `upgrade.onboard-ref`, or the old process can keep the previous gateway token and fail `gateway-health` with `unauthorized: gateway token mismatch`.
- The standalone Windows upgrade smoke lane should stop the managed gateway after `upgrade.install-main` and before `upgrade.onboard-ref`. Restarting before onboard can leave the old process alive on the pre-onboard token while onboard rewrites `~/.openclaw/openclaw.json`, which then fails `gateway-health` with `unauthorized: gateway token mismatch`.
- If standalone Windows upgrade fails with a gateway token mismatch but `pnpm test:parallels:npm-update` passes, trust the mismatch as a standalone ref-onboard ordering bug first; the npm-update helper does not re-run ref-mode onboard on the same guest.
- Keep onboarding and status output ASCII-clean in logs; fancy punctuation becomes mojibake in current capture paths.
- If you hit an older run with `rc=255` plus an empty `fresh.install-main.log` or `upgrade.install-main.log`, treat it as a likely `prlctl exec` transport drop after guest start-up, not immediate proof of an npm/package failure.

View File

@ -984,12 +984,13 @@ verify_gateway() {
guest_run_openclaw "" "" gateway status --deep --require-rpc
}
restart_gateway() {
run_gateway_daemon_action() {
local action="$1"
local runner_name log_name done_name done_status launcher_state
local poll_rc state_rc log_rc start_seconds poll_deadline startup_checked
runner_name="openclaw-gateway-restart-$RANDOM-$RANDOM.ps1"
log_name="openclaw-gateway-restart-$RANDOM-$RANDOM.log"
done_name="openclaw-gateway-restart-$RANDOM-$RANDOM.done"
runner_name="openclaw-gateway-$action-$RANDOM-$RANDOM.ps1"
log_name="openclaw-gateway-$action-$RANDOM-$RANDOM.log"
done_name="openclaw-gateway-$action-$RANDOM-$RANDOM.done"
start_seconds="$SECONDS"
poll_deadline=$((SECONDS + TIMEOUT_GATEWAY_S + 60))
startup_checked=0
@ -1006,7 +1007,7 @@ Remove-Item \$runner, \$log, \$done -Force -ErrorAction SilentlyContinue
\$done = Join-Path \$env:TEMP '$done_name'
try {
\$openclaw = Join-Path \$env:APPDATA 'npm\openclaw.cmd'
& \$openclaw gateway restart *>&1 | Tee-Object -FilePath \$log -Append | Out-Null
& \$openclaw gateway $action *>&1 | Tee-Object -FilePath \$log -Append | Out-Null
Set-Content -Path \$done -Value ([string]\$LASTEXITCODE)
} catch {
if (Test-Path \$log) {
@ -1030,9 +1031,9 @@ EOF
set -e
done_status="${done_status//$'\r'/}"
if [[ $poll_rc -ne 0 ]]; then
warn "windows gateway restart helper poll failed; retrying"
warn "windows gateway $action helper poll failed; retrying"
if (( SECONDS >= poll_deadline )); then
warn "windows gateway restart helper timed out while polling done file"
warn "windows gateway $action helper timed out while polling done file"
return 1
fi
sleep 2
@ -1044,7 +1045,7 @@ EOF
log_rc=$?
set -e
if [[ $log_rc -ne 0 ]]; then
warn "windows gateway restart helper log drain failed after completion"
warn "windows gateway $action helper log drain failed after completion"
fi
[[ "$done_status" == "0" ]]
return $?
@ -1059,7 +1060,7 @@ EOF
launcher_state="${launcher_state//$'\r'/}"
startup_checked=1
if [[ $state_rc -eq 0 && "$launcher_state" == *"runner=False"* && "$launcher_state" == *"log=False"* && "$launcher_state" == *"done=False"* ]]; then
warn "windows gateway restart helper failed to materialize guest files"
warn "windows gateway $action helper failed to materialize guest files"
return 1
fi
fi
@ -1069,15 +1070,23 @@ EOF
log_rc=$?
set -e
if [[ $log_rc -ne 0 ]]; then
warn "windows gateway restart helper log drain failed after timeout"
warn "windows gateway $action helper log drain failed after timeout"
fi
warn "windows gateway restart helper timed out waiting for done file"
warn "windows gateway $action helper timed out waiting for done file"
return 1
fi
sleep 2
done
}
restart_gateway() {
run_gateway_daemon_action restart
}
stop_gateway() {
run_gateway_daemon_action stop
}
show_gateway_status_compat() {
if guest_run_openclaw "" "" gateway status --help | grep -Fq -- "--require-rpc"; then
guest_run_openclaw "" "" gateway status --deep --require-rpc
@ -1145,7 +1154,10 @@ run_upgrade_lane() {
phase_run "upgrade.install-main" "$TIMEOUT_INSTALL_S" install_main_tgz "$host_ip" "openclaw-main-upgrade.tgz" || return $?
UPGRADE_MAIN_VERSION="$(extract_last_version "$(phase_log_path upgrade.install-main)")"
phase_run "upgrade.verify-main-version" "$TIMEOUT_VERIFY_S" verify_target_version || return $?
phase_run "upgrade.gateway-restart" "$TIMEOUT_GATEWAY_S" restart_gateway || return $?
# Stop the old managed gateway before ref-mode onboard rewrites config and
# gateway auth. Restarting first can leave the old token alive and make the
# onboard health probe fail against a stale daemon.
phase_run "upgrade.gateway-stop" "$TIMEOUT_GATEWAY_S" stop_gateway || return $?
phase_run "upgrade.onboard-ref" "$TIMEOUT_ONBOARD_S" run_ref_onboard || return $?
phase_run "upgrade.gateway-status" "$TIMEOUT_GATEWAY_S" verify_gateway || return $?
UPGRADE_GATEWAY_STATUS="pass"