Fix errors seen during longer runs: Selenium exceptions, insecure certificates, recursion limit exceeded, BM25 document corpus too small
This commit is contained in:
+87
-27
@@ -1,32 +1,92 @@
|
||||
import { Builder, Browser } from "selenium-webdriver";
|
||||
import firefox from "selenium-webdriver/firefox";
|
||||
import { backOff } from "exponential-backoff";
|
||||
import { logger } from "../utils/logger";
|
||||
|
||||
export async function extractWebpageContent(url: string) : Promise<string[]>{
|
||||
const options = new firefox.Options();
|
||||
options.addArguments("--headless");
|
||||
|
||||
let driver = await new Builder().forBrowser(Browser.FIREFOX).setFirefoxOptions(options).build()
|
||||
try {
|
||||
await driver.get(url)
|
||||
await driver.wait(async () => {
|
||||
return await driver.executeScript(
|
||||
"return document.readyState === 'complete'"
|
||||
);
|
||||
}, 5000);
|
||||
|
||||
const readableText = await driver.executeScript(
|
||||
"return document.body.innerText;"
|
||||
) as string;
|
||||
|
||||
const filteredLines = readableText
|
||||
.split(/\r?\n/)
|
||||
.map(line => line.trim())
|
||||
.filter(line => line.split(/\s+/).length > 1);
|
||||
|
||||
return filteredLines;
|
||||
} finally {
|
||||
await driver.quit()
|
||||
}
|
||||
/**
 * Extracts the readable text lines of a web page, retrying failed attempts.
 *
 * The browser work is delegated to `extractWebpageContentWorker`, which is
 * retried through `backOff` with up to 10 attempts, a 500 ms starting delay
 * that doubles each time, full jitter, and a 50 s cap on any single delay.
 *
 * This function does not reject: when the retry loop gives up, the failure is
 * logged (including the reason reported by the last attempt) and a
 * single-element placeholder is returned instead.
 *
 * @param url - Address of the page to load.
 * @returns The lines produced by the worker, or `["API EXCEPTION"]` when every
 *   attempt failed.
 */
export async function extractWebpageContent(url: string): Promise<string[]> {
  try {
    return await backOff(() => extractWebpageContentWorker(url), {
      numOfAttempts: 10,
      startingDelay: 500,
      timeMultiple: 2,
      jitter: "full",
      maxDelay: 50000,
    });
  } catch (err: unknown) {
    // Anything can be thrown in JavaScript, so narrow before reading `.message`.
    const reason = err instanceof Error ? err.message : String(err);
    logger.error(
      `Failed out of retry loop for URL "${url}", returning placeholder to pipeline: ${reason}`
    );
    return ["API EXCEPTION"];
  }
}
|
||||
|
||||
//console.log(await extractWebpageContent("https://www.bbc.co.uk/news/live/c74wd01egvyt"))
|
||||
/** Returns a printable description of any thrown value, not only `Error` instances. */
function describeThrown(thrown: unknown): string {
  return thrown instanceof Error ? thrown.message : String(thrown);
}

/**
 * Starts a Firefox session with the `--headless` argument.
 *
 * @throws Error when the driver cannot be built; the failure is logged first.
 */
async function launchHeadlessFirefox() {
  try {
    const options = new firefox.Options();
    options.addArguments("--headless");
    // `return await` keeps a rejected build inside this try/catch.
    return await new Builder()
      .forBrowser(Browser.FIREFOX)
      .setFirefoxOptions(options)
      .build();
  } catch (err: unknown) {
    const desc = `Failed to launch Firefox driver: ${describeThrown(err)}`;
    logger.error(desc);
    throw new Error(desc);
  }
}

/** Splits text into trimmed lines and keeps only those with more than one whitespace-separated token. */
function keepMultiWordLines(text: string): string[] {
  return text
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.split(/\s+/).length > 1);
}

/**
 * Performs a single attempt at loading `url` in headless Firefox and reading
 * its visible text.
 *
 * Steps: launch the driver, navigate, wait up to 5000 ms for
 * `document.readyState` to be `'complete'`, read `document.body.innerText`,
 * and filter the text down to multi-word lines. A failed or timed-out
 * readiness wait is logged but does not abort the attempt; whatever text is
 * available is still read. The driver is always quit afterwards, and a failure
 * to quit is logged rather than propagated.
 *
 * @param url - Address of the page to load.
 * @returns The trimmed, multi-word lines of the page text (never empty).
 * @throws Error when the driver cannot be launched, navigation fails, the page
 *   text cannot be read, or no lines survive filtering. Each failure is logged
 *   before being thrown so the caller's retry loop can react to it.
 */
async function extractWebpageContentWorker(url: string): Promise<string[]> {
  const driver = await launchHeadlessFirefox();

  try {
    try {
      await driver.get(url);
    } catch (err: unknown) {
      const desc = `Failed to navigate to URL "${url}": ${describeThrown(err)}`;
      logger.error(desc);
      throw new Error(desc);
    }

    try {
      await driver.wait(async () => {
        return await driver.executeScript(
          "return document.readyState === 'complete'"
        );
      }, 5000);
    } catch (err: unknown) {
      // Deliberately not rethrown: a slow page may still have usable partial content.
      logger.error(
        `Page load timed out for "${url}", attempting to read partial content: ${describeThrown(err)}`
      );
    }

    let readableText: unknown;
    try {
      readableText = await driver.executeScript(
        "return document.body.innerText;"
      );
    } catch (err: unknown) {
      const desc = `Failed to extract page text from "${url}": ${describeThrown(err)}`;
      logger.error(desc);
      throw new Error(desc);
    }

    // The script result is untyped at runtime; verify it instead of asserting,
    // so a non-string result fails with a clear message rather than a TypeError.
    if (typeof readableText !== "string") {
      const desc = `Failed to extract page text from "${url}": script returned ${typeof readableText} instead of a string`;
      logger.error(desc);
      throw new Error(desc);
    }

    const filteredLines = keepMultiWordLines(readableText);

    if (filteredLines.length === 0) {
      const desc = `No content extracted from "${url}"`;
      logger.error(desc);
      throw new Error(desc);
    }

    return filteredLines;
  } finally {
    try {
      await driver.quit();
    } catch (err: unknown) {
      // Cleanup failure must not mask the attempt's own result or error.
      logger.error(`Failed to quit Firefox driver cleanly: ${describeThrown(err)}`);
    }
  }
}
|
||||
|
||||
// console.log(await extractWebpageContent("https://www.bbc.co.uk/news/live/c74wd01egvyt"))
|
||||
// console.log(await extractWebpageContent("https://badcertificate.int.jeynes.uk/"))
|
||||
Reference in New Issue
Block a user