Fix errors seen during longer runs: selenium exceptions, insecure certificates, recusrsion limit exceeded, BM25 document corpus too small

This commit is contained in:
William Jeynes
2026-03-26 12:22:13 +00:00
parent fbc688b8f9
commit 5e374a8bd6
3 changed files with 104 additions and 30 deletions
+16 -2
View File
@@ -15,6 +15,8 @@ const CACHE_PATH = "../data/csv.cache.json";
const JSONL_PATH = "../data/input.jsonl"
const BM25_MIN_DOCS = 3;
type EmbeddingCache = {
rawtexts: string[];
cleantexts: string[];
@@ -287,8 +289,20 @@ async function embedText(text: string): Promise<number[]> {
}
function buildBM25(texts: string[]) {
logger.info("Building BM25 index (%s docs)...", texts.length);
let paddedTexts = texts;
if (texts.length < BM25_MIN_DOCS) {
const needed = BM25_MIN_DOCS - texts.length;
logger.error(
"Corpus too small for BM25 (%s docs, need %s+), padding with %s dummy doc(s)",
texts.length,
BM25_MIN_DOCS,
needed
);
paddedTexts = [...texts, ...Array(needed).fill("placeholder dummy document")];
}
logger.info("Building BM25 index (%s docs)...", paddedTexts.length);
const bm25 = bm25Factory();
bm25.defineConfig({
@@ -302,7 +316,7 @@ function buildBM25(texts: string[]) {
nlp.tokens.removeWords,
]);
texts.forEach((text, i) => {
paddedTexts.forEach((text, i) => {
bm25.addDoc({ text }, i);
});
+64 -4
View File
@@ -1,32 +1,92 @@
import { Builder, Browser } from "selenium-webdriver";
import firefox from "selenium-webdriver/firefox";
import { backOff } from "exponential-backoff";
import { logger } from "../utils/logger";
export async function extractWebpageContent(url: string): Promise<string[]> {
try {
const response = await backOff(async () => {
return await extractWebpageContentWorker(url);
}, {
numOfAttempts: 10,
startingDelay: 500,
timeMultiple: 2,
jitter: "full",
maxDelay: 50000,
});
return response;
} catch (err: any) {
logger.error(`Failed out of retry loop for URL "${url}", returning placeholder to pipeline`);
return ["API EXCEPTION"];
}
}
async function extractWebpageContentWorker(url: string): Promise<string[]> {
let driver;
try {
const options = new firefox.Options();
options.addArguments("--headless");
driver = await new Builder()
.forBrowser(Browser.FIREFOX)
.setFirefoxOptions(options)
.build();
} catch (err: any) {
const desc = `Failed to launch Firefox driver: ${err.message}`;
logger.error(desc);
throw new Error(desc);
}
try {
try {
await driver.get(url);
} catch (err: any) {
const desc = `Failed to navigate to URL "${url}": ${err.message}`;
logger.error(desc);
throw new Error(desc);
}
let driver = await new Builder().forBrowser(Browser.FIREFOX).setFirefoxOptions(options).build()
try {
await driver.get(url)
await driver.wait(async () => {
return await driver.executeScript(
"return document.readyState === 'complete'"
);
}, 5000);
} catch (err: any) {
logger.error(`Page load timed out for "${url}", attempting to read partial content: ${err.message}`);
// do not throw, attempt to read
}
const readableText = await driver.executeScript(
let readableText: string;
try {
readableText = await driver.executeScript(
"return document.body.innerText;"
) as string;
} catch (err: any) {
const desc = `Failed to extract page text from "${url}": ${err.message}`;
logger.error(desc);
throw new Error(desc);
}
const filteredLines = readableText
.split(/\r?\n/)
.map(line => line.trim())
.filter(line => line.split(/\s+/).length > 1);
if (filteredLines.length === 0) {
const desc = `No content extracted from "${url}"`;
logger.error(desc);
throw new Error(desc);
}
return filteredLines;
} finally {
await driver.quit()
try {
await driver.quit();
} catch (err: any) {
logger.error(`Failed to quit Firefox driver cleanly: ${err.message}`);
}
}
}
// console.log(await extractWebpageContent("https://www.bbc.co.uk/news/live/c74wd01egvyt"))
// console.log(await extractWebpageContent("https://badcertificate.int.jeynes.uk/"))
+1 -1
View File
@@ -118,7 +118,7 @@ async function processRecord(record: any): Promise<ResultRecord> {
input: buildAgentInput(record),
streamMode: "values",
config: {
recursion_limit: 50
recursion_limit: 100
}
});