// openclaw/extensions/firecrawl/src/firecrawl-client.ts
//
// 416 lines
// 12 KiB
// TypeScript

import { markdownToText, truncateText } from "openclaw/plugin-sdk/agent-runtime";
import type { OpenClawConfig } from "openclaw/plugin-sdk/config-runtime";
import {
DEFAULT_CACHE_TTL_MINUTES,
normalizeCacheKey,
postTrustedWebToolsJson,
readCache,
resolveCacheTtlMs,
writeCache,
} from "openclaw/plugin-sdk/provider-web-search";
import { wrapExternalContent, wrapWebContent } from "openclaw/plugin-sdk/security-runtime";
import {
resolveFirecrawlApiKey,
resolveFirecrawlBaseUrl,
resolveFirecrawlMaxAgeMs,
resolveFirecrawlOnlyMainContent,
resolveFirecrawlScrapeTimeoutSeconds,
resolveFirecrawlSearchTimeoutSeconds,
} from "./config.js";
/** One in-memory cache slot: the stored result payload plus its `expiresAt`/`insertedAt` numbers. */
type FirecrawlCacheEntry = {
  value: Record<string, unknown>;
  expiresAt: number;
  insertedAt: number;
};

/** Module-level cache of search results, keyed by the normalized request key. */
const SEARCH_CACHE = new Map<string, FirecrawlCacheEntry>();

/** Module-level cache of scrape results, kept separate from the search cache. */
const SCRAPE_CACHE = new Map<string, FirecrawlCacheEntry>();

/** Result count used when the caller does not pass a finite numeric `count`. */
const DEFAULT_SEARCH_COUNT = 5;

/** Character budget used when the caller does not pass a positive finite `maxChars`. */
const DEFAULT_SCRAPE_MAX_CHARS = 50_000;
/** Normalized search hit extracted from a raw Firecrawl search response entry. */
type FirecrawlSearchItem = {
/** Result title; an empty string when the raw entry carries no usable title. */
title: string;
/** Result URL; raw entries without a non-empty URL are dropped before reaching this type. */
url: string;
/** Short descriptive text for the hit, when the raw entry provides one. */
description?: string;
/** Page body text for the hit; only surfaced in results when scraping of results was requested. */
content?: string;
/** Publication date string exactly as reported by the raw entry, when present. */
published?: string;
/** Hostname of `url` with a leading "www." removed; undefined when `url` cannot be parsed. */
siteName?: string;
};
/** Input accepted by `runFirecrawlSearch`. */
export type FirecrawlSearchParams = {
/** Config used to resolve the Firecrawl API key and base URL. */
cfg?: OpenClawConfig;
/** Search query, sent as `query` in the request body. */
query: string;
/** Desired number of results; floored and clamped to 1-10, defaulting to 5 when not a finite number. */
count?: number;
/** Per-call timeout override, passed to `resolveFirecrawlSearchTimeoutSeconds`. */
timeoutSeconds?: number;
/** Source names forwarded as `sources` when at least one truthy entry remains. */
sources?: string[];
/** Category names forwarded as `categories` when at least one truthy entry remains. */
categories?: string[];
/** Only the literal `true` enables it: requests markdown scraping of hits and includes `content` in results. */
scrapeResults?: boolean;
};
/** Input accepted by `runFirecrawlScrape`. */
export type FirecrawlScrapeParams = {
/** Config used to resolve the API key, base URL, and the timeout / main-content / max-age defaults. */
cfg?: OpenClawConfig;
/** Page to scrape; sent as `url` in the request body and echoed back in the result. */
url: string;
/** "markdown" returns the scraped markdown as-is; "text" converts it with `markdownToText` first. */
extractMode: "markdown" | "text";
/** Character budget for the returned text; floored, and replaced by 50,000 unless a positive finite number. */
maxChars?: number;
/** Per-call override passed to `resolveFirecrawlOnlyMainContent`. */
onlyMainContent?: boolean;
/** Per-call override passed to `resolveFirecrawlMaxAgeMs`; the resolved value is sent as `maxAge`. */
maxAgeMs?: number;
/** Proxy mode forwarded to the API; defaults to "auto". */
proxy?: "auto" | "basic" | "stealth";
/** Forwarded to the API as `storeInCache` (default true); the local in-memory cache is written regardless. */
storeInCache?: boolean;
/** Per-call override passed to `resolveFirecrawlScrapeTimeoutSeconds`; also sent as `timeout` in milliseconds. */
timeoutSeconds?: number;
};
/**
 * Build the absolute URL for a Firecrawl API call.
 *
 * - A blank or unparseable `baseUrl` falls back to `https://api.firecrawl.dev` + `pathname`.
 * - A `baseUrl` that already has a non-root path is returned unchanged (the
 *   requested `pathname` is NOT appended in that case).
 * - Otherwise `pathname` is set on the parsed base URL.
 *
 * @param baseUrl Configured base URL; surrounding whitespace is ignored.
 * @param pathname API route to target.
 * @returns The serialized endpoint URL.
 */
function resolveEndpoint(baseUrl: string, pathname: "/v2/search" | "/v2/scrape"): string {
  const hostedEndpoint = (): string => new URL(pathname, "https://api.firecrawl.dev").toString();

  const candidate = baseUrl.trim();
  if (candidate.length === 0) {
    return hostedEndpoint();
  }

  try {
    const parsed = new URL(candidate);
    const hasCustomPath = parsed.pathname !== "" && parsed.pathname !== "/";
    if (!hasCustomPath) {
      parsed.pathname = pathname;
    }
    return parsed.toString();
  } catch {
    // Not a parseable absolute URL: use the hosted API instead of failing.
    return hostedEndpoint();
  }
}
/**
 * Derive a display site name from a URL: its hostname with one leading "www." removed.
 *
 * @param urlRaw URL string to inspect.
 * @returns The bare hostname, or `undefined` when the URL cannot be parsed or has no hostname.
 */
function resolveSiteName(urlRaw: string): string | undefined {
  let hostname: string;
  try {
    hostname = new URL(urlRaw).hostname;
  } catch {
    return undefined;
  }
  const bareHost = hostname.replace(/^www\./, "");
  return bareHost === "" ? undefined : bareHost;
}
/** Return the first argument that is a non-empty string, or `undefined` when there is none. */
function pickFirstString(...values: unknown[]): string | undefined {
  return values.find(
    (value): value is string => typeof value === "string" && value.length > 0,
  );
}

/** View a value as a record when it is a non-null object; otherwise `undefined`. */
function toRecord(value: unknown): Record<string, unknown> | undefined {
  return value && typeof value === "object" ? (value as Record<string, unknown>) : undefined;
}

/**
 * Normalize a raw Firecrawl search response into a flat list of search items.
 *
 * The response shape is probed in priority order and only the FIRST array found
 * is used: `data`, `results`, `data.results`, `data.data`, `data.web`,
 * `web.results`, and finally `data.news` / `data.images`. The last two are
 * lowest-priority fallbacks so that a search restricted to news or image
 * sources does not normalize to an empty list.
 *
 * Entries that are not objects or have no non-empty URL are skipped.
 *
 * @param payload Parsed JSON body of the search response.
 * @returns Normalized items; an empty array when no result array is present.
 */
function resolveSearchItems(payload: Record<string, unknown>): FirecrawlSearchItem[] {
  const data = toRecord(payload.data);
  const web = toRecord(payload.web);
  const candidates: unknown[] = [
    payload.data,
    payload.results,
    data?.results,
    data?.data,
    data?.web,
    web?.results,
    // Fallbacks for source-specific buckets; consulted only when nothing above is an array.
    data?.news,
    data?.images,
  ];
  const rawItems = candidates.find((candidate): candidate is unknown[] =>
    Array.isArray(candidate),
  );
  if (!rawItems) {
    return [];
  }

  const items: FirecrawlSearchItem[] = [];
  for (const entry of rawItems) {
    const record = toRecord(entry);
    if (!record) {
      continue;
    }
    const metadata = toRecord(record.metadata);
    const url = pickFirstString(
      record.url,
      record.sourceURL,
      record.sourceUrl,
      metadata?.sourceURL,
    );
    if (!url) {
      // A hit without a URL cannot be cited or followed, so it is dropped.
      continue;
    }
    items.push({
      title: pickFirstString(record.title, metadata?.title) ?? "",
      url,
      description: pickFirstString(record.description, record.snippet, record.summary),
      content: pickFirstString(record.markdown, record.content, record.text),
      published: pickFirstString(
        record.publishedDate,
        record.published,
        metadata?.publishedTime,
        metadata?.publishedDate,
        // News-style entries report their date under `date`; used only as a last resort.
        record.date,
      ),
      siteName: resolveSiteName(url),
    });
  }
  return items;
}
/**
 * Assemble the tool result for a search call.
 *
 * Titles, descriptions and (when `scrapeResults` is set) content are passed
 * through `wrapWebContent` so they are marked as untrusted web text; URLs,
 * `published` and `siteName` are emitted as-is. Optional fields are omitted
 * when empty rather than emitted as `undefined`.
 *
 * @param params Query, provider tag, normalized items, elapsed time and the scrape flag.
 * @returns The result object with `count` set to the number of items.
 */
function buildSearchPayload(params: {
  query: string;
  provider: "firecrawl";
  items: FirecrawlSearchItem[];
  tookMs: number;
  scrapeResults: boolean;
}): Record<string, unknown> {
  const { query, provider, items, tookMs, scrapeResults } = params;

  const results = items.map((item) => {
    const row: Record<string, unknown> = {
      title: item.title ? wrapWebContent(item.title, "web_search") : "",
      url: item.url,
      description: item.description ? wrapWebContent(item.description, "web_search") : "",
    };
    if (item.published) {
      row.published = item.published;
    }
    if (item.siteName) {
      row.siteName = item.siteName;
    }
    if (scrapeResults && item.content) {
      row.content = wrapWebContent(item.content, "web_search");
    }
    return row;
  });

  return {
    query,
    provider,
    count: items.length,
    tookMs,
    externalContent: {
      untrusted: true,
      source: "web_search",
      provider,
      wrapped: true,
    },
    results,
  };
}
/**
 * Run a Firecrawl web search and return a wrapped, cacheable result payload.
 *
 * Flow: resolve credentials and options, look up the in-memory search cache,
 * otherwise POST to the `/v2/search` endpoint, normalize the hits, store the
 * result in the cache and return it. Cache hits are returned with `cached: true`.
 *
 * @param params Search parameters; see `FirecrawlSearchParams`.
 * @returns The search result payload.
 * @throws Error when no API key is configured, or when the API responds with `success: false`.
 */
export async function runFirecrawlSearch(
  params: FirecrawlSearchParams,
): Promise<Record<string, unknown>> {
  const apiKey = resolveFirecrawlApiKey(params.cfg);
  if (!apiKey) {
    throw new Error(
      "web_search (firecrawl) needs a Firecrawl API key. Set FIRECRAWL_API_KEY in the Gateway environment, or configure plugins.entries.firecrawl.config.webSearch.apiKey.",
    );
  }

  // Clamp the requested count into 1..10; anything non-numeric uses the default.
  let count = DEFAULT_SEARCH_COUNT;
  if (typeof params.count === "number" && Number.isFinite(params.count)) {
    count = Math.min(10, Math.max(1, Math.floor(params.count)));
  }
  const timeoutSeconds = resolveFirecrawlSearchTimeoutSeconds(params.timeoutSeconds);
  const scrapeResults = params.scrapeResults === true;
  const sources = Array.isArray(params.sources) ? params.sources.filter(Boolean) : [];
  const categories = Array.isArray(params.categories) ? params.categories.filter(Boolean) : [];
  const baseUrl = resolveFirecrawlBaseUrl(params.cfg);

  // Every option that changes the response participates in the cache key.
  const cacheKey = normalizeCacheKey(
    JSON.stringify({
      type: "firecrawl-search",
      q: params.query,
      count,
      baseUrl,
      sources,
      categories,
      scrapeResults,
    }),
  );
  const cacheHit = readCache(SEARCH_CACHE, cacheKey);
  if (cacheHit) {
    return { ...cacheHit.value, cached: true };
  }

  // Optional fields are only sent when they carry something.
  const requestBody: Record<string, unknown> = {
    query: params.query,
    limit: count,
    ...(sources.length > 0 ? { sources } : {}),
    ...(categories.length > 0 ? { categories } : {}),
    ...(scrapeResults ? { scrapeOptions: { formats: ["markdown"] } } : {}),
  };

  const startedAt = Date.now();
  const responseJson = await postTrustedWebToolsJson(
    {
      url: resolveEndpoint(baseUrl, "/v2/search"),
      timeoutSeconds,
      apiKey,
      body: requestBody,
      errorLabel: "Firecrawl Search",
    },
    async (response) => {
      const json = (await response.json()) as Record<string, unknown>;
      if (json.success !== false) {
        return json;
      }
      let detail = "unknown error";
      if (typeof json.error === "string") {
        detail = json.error;
      } else if (typeof json.message === "string") {
        detail = json.message;
      }
      throw new Error(`Firecrawl Search API error: ${detail}`);
    },
  );

  const items = resolveSearchItems(responseJson);
  const searchResult = buildSearchPayload({
    query: params.query,
    provider: "firecrawl",
    items,
    tookMs: Date.now() - startedAt,
    scrapeResults,
  });
  const ttlMs = resolveCacheTtlMs(undefined, DEFAULT_CACHE_TTL_MINUTES);
  writeCache(SEARCH_CACHE, cacheKey, searchResult, ttlMs);
  return searchResult;
}
/**
 * Extract the `data` object from a scrape response.
 *
 * @param payload Parsed JSON body of the scrape response.
 * @returns `payload.data` when it is a non-null object; otherwise a fresh empty object.
 */
function resolveScrapeData(payload: Record<string, unknown>): Record<string, unknown> {
  const { data } = payload;
  const isObjectLike = typeof data === "object" && data !== null;
  return isObjectLike ? (data as Record<string, unknown>) : {};
}
/**
 * Convert a raw Firecrawl scrape response into the tool result shape.
 *
 * The page body is taken from `data.markdown` (falling back to `data.content`),
 * optionally converted to plain text, truncated to `maxChars`, and wrapped as
 * untrusted external content. The title and any top-level `warning` are wrapped
 * the same way.
 *
 * @param params.payload Parsed JSON body of the scrape response.
 * @param params.url URL that was requested; also the fallback for `finalUrl`.
 * @param params.extractMode "text" converts the markdown with `markdownToText`; "markdown" keeps it.
 * @param params.maxChars Character budget applied to the unwrapped text.
 * @returns The scrape result; `rawLength` is the pre-truncation length and
 *   `wrappedLength` is the length of the returned `text`.
 * @throws Error when the response carries no markdown/content.
 */
export function parseFirecrawlScrapePayload(params: {
  payload: Record<string, unknown>;
  url: string;
  extractMode: "markdown" | "text";
  maxChars: number;
}): Record<string, unknown> {
  const data = resolveScrapeData(params.payload);
  const metadata =
    data.metadata && typeof data.metadata === "object"
      ? (data.metadata as Record<string, unknown>)
      : undefined;
  const markdown =
    (typeof data.markdown === "string" && data.markdown) ||
    (typeof data.content === "string" && data.content) ||
    "";
  if (!markdown) {
    throw new Error("Firecrawl scrape returned no content.");
  }

  /** Mark a piece of fetched text as untrusted, without the inline warning banner. */
  const wrapFetched = (text: string) =>
    wrapExternalContent(text, { source: "web_fetch", includeWarning: false });

  const rawText = params.extractMode === "text" ? markdownToText(markdown) : markdown;
  const truncated = truncateText(rawText, params.maxChars);
  // Wrap once and reuse, so `wrappedLength` always describes the exact `text` returned.
  const wrappedText = wrapFetched(truncated.text);

  return {
    url: params.url,
    finalUrl:
      (typeof metadata?.sourceURL === "string" && metadata.sourceURL) ||
      (typeof data.url === "string" && data.url) ||
      params.url,
    status:
      (typeof metadata?.statusCode === "number" && metadata.statusCode) ||
      (typeof data.statusCode === "number" && data.statusCode) ||
      undefined,
    title:
      typeof metadata?.title === "string" && metadata.title
        ? wrapFetched(metadata.title)
        : undefined,
    extractor: "firecrawl",
    extractMode: params.extractMode,
    externalContent: {
      untrusted: true,
      source: "web_fetch",
      wrapped: true,
    },
    truncated: truncated.truncated,
    rawLength: rawText.length,
    wrappedLength: wrappedText.length,
    text: wrappedText,
    warning:
      typeof params.payload.warning === "string" && params.payload.warning
        ? wrapFetched(params.payload.warning)
        : undefined,
  };
}
/**
 * Scrape a single page through Firecrawl and return a wrapped, cacheable result.
 *
 * Flow: resolve credentials and options, look up the in-memory scrape cache,
 * otherwise POST to the `/v2/scrape` endpoint (always requesting markdown),
 * parse the response, store the result in the cache and return it. Cache hits
 * are returned with `cached: true`.
 *
 * @param params Scrape parameters; see `FirecrawlScrapeParams`.
 * @returns The scrape result payload.
 * @throws Error when no API key is configured, when the API responds with
 *   `success: false`, or when the response carries no content.
 */
export async function runFirecrawlScrape(
  params: FirecrawlScrapeParams,
): Promise<Record<string, unknown>> {
  const apiKey = resolveFirecrawlApiKey(params.cfg);
  if (!apiKey) {
    throw new Error(
      "firecrawl_scrape needs a Firecrawl API key. Set FIRECRAWL_API_KEY in the Gateway environment, or configure tools.web.fetch.firecrawl.apiKey.",
    );
  }
  const baseUrl = resolveFirecrawlBaseUrl(params.cfg);
  const timeoutSeconds = resolveFirecrawlScrapeTimeoutSeconds(params.cfg, params.timeoutSeconds);
  const onlyMainContent = resolveFirecrawlOnlyMainContent(params.cfg, params.onlyMainContent);
  const maxAgeMs = resolveFirecrawlMaxAgeMs(params.cfg, params.maxAgeMs);
  const proxy = params.proxy ?? "auto";
  const storeInCache = params.storeInCache ?? true;
  const maxChars =
    typeof params.maxChars === "number" && Number.isFinite(params.maxChars) && params.maxChars > 0
      ? Math.floor(params.maxChars)
      : DEFAULT_SCRAPE_MAX_CHARS;

  // Every option that changes the request or the shaped output participates in the cache key.
  const cacheKey = normalizeCacheKey(
    JSON.stringify({
      type: "firecrawl-scrape",
      url: params.url,
      extractMode: params.extractMode,
      baseUrl,
      onlyMainContent,
      maxAgeMs,
      proxy,
      storeInCache,
      maxChars,
    }),
  );
  const cached = readCache(SCRAPE_CACHE, cacheKey);
  if (cached) {
    return { ...cached.value, cached: true };
  }

  const payload = await postTrustedWebToolsJson(
    {
      url: resolveEndpoint(baseUrl, "/v2/scrape"),
      timeoutSeconds,
      apiKey,
      errorLabel: "Firecrawl",
      body: {
        url: params.url,
        formats: ["markdown"],
        onlyMainContent,
        // The request body carries the timeout in milliseconds.
        timeout: timeoutSeconds * 1000,
        maxAge: maxAgeMs,
        proxy,
        storeInCache,
      },
    },
    async (response) => {
      const json = (await response.json()) as Record<string, unknown>;
      // Surface an API-reported failure directly, as the search path does, instead of
      // letting it fall through to the generic "returned no content" error.
      if (json.success === false) {
        const detail =
          typeof json.error === "string"
            ? json.error
            : typeof json.message === "string"
              ? json.message
              : "unknown error";
        throw new Error(`Firecrawl Scrape API error: ${detail}`);
      }
      return json;
    },
  );
  const result = parseFirecrawlScrapePayload({
    payload,
    url: params.url,
    extractMode: params.extractMode,
    maxChars,
  });
  writeCache(
    SCRAPE_CACHE,
    cacheKey,
    result,
    resolveCacheTtlMs(undefined, DEFAULT_CACHE_TTL_MINUTES),
  );
  return result;
}
/** Internal helpers exposed so they can be called directly; not part of the supported API surface. */
export const __testing = {
  parseFirecrawlScrapePayload: parseFirecrawlScrapePayload,
  resolveSearchItems: resolveSearchItems,
};