diff --git a/agent/tools/retreiveExamples.ts b/agent/tools/retreiveExamples.ts
index 58067a6..fbbcdba 100644
--- a/agent/tools/retreiveExamples.ts
+++ b/agent/tools/retreiveExamples.ts
@@ -15,6 +15,8 @@
 const CACHE_PATH = "../data/csv.cache.json";
 const JSONL_PATH = "../data/input.jsonl"
 
+const BM25_MIN_DOCS = 3;
+
 type EmbeddingCache = {
   rawtexts: string[];
   cleantexts: string[];
@@ -287,8 +289,22 @@ async function embedText(text: string): Promise {
 }
 
 function buildBM25(texts: string[]) {
-  logger.info("Building BM25 index (%s docs)...", texts.length);
+  // Corpora smaller than BM25_MIN_DOCS are padded with dummy documents before indexing.
+  // NOTE(review): padding is indexed under ids >= texts.length; confirm search callers skip such ids.
+  let paddedTexts = texts;
+  if (texts.length < BM25_MIN_DOCS) {
+    const needed = BM25_MIN_DOCS - texts.length;
+    logger.error(
+      "Corpus too small for BM25 (%s docs, need %s+), padding with %s dummy doc(s)",
+      texts.length,
+      BM25_MIN_DOCS,
+      needed
+    );
+    paddedTexts = [...texts, ...new Array<string>(needed).fill("placeholder dummy document")];
+  }
 
+  logger.info("Building BM25 index (%s docs)...", paddedTexts.length);
+
   const bm25 = bm25Factory();
 
   bm25.defineConfig({
@@ -302,7 +318,7 @@ function buildBM25(texts: string[]) {
     nlp.tokens.removeWords,
   ]);
 
-  texts.forEach((text, i) => {
+  paddedTexts.forEach((text, i) => {
     bm25.addDoc({ text }, i);
   });
 
diff --git a/agent/tools/webpageFetch.ts b/agent/tools/webpageFetch.ts
index 74c9fd6..20c2874 100644
--- a/agent/tools/webpageFetch.ts
+++ b/agent/tools/webpageFetch.ts
@@ -1,32 +1,120 @@
 import { Builder, Browser } from "selenium-webdriver";
 import firefox from "selenium-webdriver/firefox";
+import { backOff } from "exponential-backoff";
+import { logger } from "../utils/logger";
 
-export async function extractWebpageContent(url: string) : Promise<string[]>{
-  const options = new firefox.Options();
-  options.addArguments("--headless");
-
-  let driver = await new Builder().forBrowser(Browser.FIREFOX).setFirefoxOptions(options).build()
-  try {
-    await driver.get(url)
-    await driver.wait(async () => {
-      return await driver.executeScript(
-        "return document.readyState === 'complete'"
-      );
-    }, 5000);
-
-    const readableText = await driver.executeScript(
-      "return document.body.innerText;"
-    ) as string;
-
-    const filteredLines = readableText
-      .split(/\r?\n/)
-      .map(line => line.trim())
-      .filter(line => line.split(/\s+/).length > 1);
-
-    return filteredLines;
-  } finally {
-    await driver.quit()
-  }
+/** Returns a printable message for any thrown value, not only `Error` instances. */
+function errorMessage(err: unknown): string {
+  return err instanceof Error ? err.message : String(err);
+}
+
+/**
+ * Fetches `url` in headless Firefox and returns the visible page text as
+ * trimmed lines of more than one word, retrying failed attempts with
+ * exponential backoff (up to 10 attempts).
+ *
+ * Never rejects: when every attempt fails, the cause is logged and the
+ * single-element placeholder `["API EXCEPTION"]` is returned to the pipeline.
+ */
+export async function extractWebpageContent(url: string): Promise<string[]> {
+  try {
+    return await backOff(() => extractWebpageContentWorker(url), {
+      numOfAttempts: 10,
+      startingDelay: 500,
+      timeMultiple: 2,
+      jitter: "full",
+      maxDelay: 50000,
+    });
+  } catch (err: unknown) {
+    logger.error(`Failed out of retry loop for URL "${url}", returning placeholder to pipeline: ${errorMessage(err)}`);
+    return ["API EXCEPTION"];
+  }
 }
 
-//console.log(await extractWebpageContent("https://www.bbc.co.uk/news/live/c74wd01egvyt"))
\ No newline at end of file
+/**
+ * Starts a headless Firefox WebDriver session.
+ *
+ * @throws Error if the driver cannot be built.
+ */
+async function launchHeadlessFirefox() {
+  try {
+    const options = new firefox.Options();
+    options.addArguments("--headless");
+    return await new Builder()
+      .forBrowser(Browser.FIREFOX)
+      .setFirefoxOptions(options)
+      .build();
+  } catch (err: unknown) {
+    const desc = `Failed to launch Firefox driver: ${errorMessage(err)}`;
+    logger.error(desc);
+    throw new Error(desc);
+  }
+}
+
+/**
+ * Performs one fetch attempt: loads `url`, waits up to 5 s for the document to
+ * finish loading (continuing with partial content on timeout), and returns the
+ * trimmed lines of `document.body.innerText` that contain more than one word.
+ *
+ * @throws Error if the browser cannot start, navigation fails, the page text
+ *   cannot be read, or no line survives filtering.
+ */
+async function extractWebpageContentWorker(url: string): Promise<string[]> {
+  // `const` from a helper (rather than an untyped `let` assigned inside a try)
+  // keeps the driver fully typed inside the closures below.
+  const driver = await launchHeadlessFirefox();
+
+  try {
+    try {
+      await driver.get(url);
+    } catch (err: unknown) {
+      const desc = `Failed to navigate to URL "${url}": ${errorMessage(err)}`;
+      logger.error(desc);
+      throw new Error(desc);
+    }
+
+    try {
+      await driver.wait(async () => {
+        return await driver.executeScript(
+          "return document.readyState === 'complete'"
+        );
+      }, 5000);
+    } catch (err: unknown) {
+      logger.error(`Page load timed out for "${url}", attempting to read partial content: ${errorMessage(err)}`);
+      // do not throw, attempt to read
+    }
+
+    let readableText: string;
+    try {
+      readableText = await driver.executeScript(
+        "return document.body.innerText;"
+      ) as string;
+    } catch (err: unknown) {
+      const desc = `Failed to extract page text from "${url}": ${errorMessage(err)}`;
+      logger.error(desc);
+      throw new Error(desc);
+    }
+
+    const filteredLines = readableText
+      .split(/\r?\n/)
+      .map(line => line.trim())
+      .filter(line => line.split(/\s+/).length > 1);
+
+    if (filteredLines.length === 0) {
+      const desc = `No content extracted from "${url}"`;
+      logger.error(desc);
+      throw new Error(desc);
+    }
+
+    return filteredLines;
+  } finally {
+    try {
+      await driver.quit();
+    } catch (err: unknown) {
+      logger.error(`Failed to quit Firefox driver cleanly: ${errorMessage(err)}`);
+    }
+  }
+}
+
+// console.log(await extractWebpageContent("https://www.bbc.co.uk/news/live/c74wd01egvyt"))
+// console.log(await extractWebpageContent("https://badcertificate.int.jeynes.uk/"))
\ No newline at end of file
diff --git a/supporting/Wrapper/run.ts b/supporting/Wrapper/run.ts
index 7e362bb..50fcaf9 100644
--- a/supporting/Wrapper/run.ts
+++ b/supporting/Wrapper/run.ts
@@ -118,7 +118,7 @@ async function processRecord(record: any): Promise {
     input: buildAgentInput(record),
     streamMode: "values",
     config: {
-      recursion_limit: 50
+      recursion_limit: 100
     }
   });
 