Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| b37799b3d2 | |||
| 10f2644408 | |||
| 7e586fe17d | |||
| 7e37a22058 | |||
| 2ed47980ef | |||
| 01b04dd73e | |||
| 593baf9b15 | |||
| 893829e599 | |||
| 36c30a427d | |||
| b610e8c989 | |||
| f8d4155b7c | |||
| 5e374a8bd6 | |||
| fbc688b8f9 |
@@ -4,3 +4,4 @@ LANGSMITH_API_KEY=123456
|
|||||||
LANGSMITH_ENDPOINT=https://eu.api.smith.langchain.com
|
LANGSMITH_ENDPOINT=https://eu.api.smith.langchain.com
|
||||||
SCRAPER_INSTANCE=https://example.com
|
SCRAPER_INSTANCE=https://example.com
|
||||||
SCRAPER_PARAM_ANYTHING=else
|
SCRAPER_PARAM_ANYTHING=else
|
||||||
|
RANKING_URL=http://localhost:8000/evaluate
|
||||||
+10
-7
@@ -1,25 +1,28 @@
|
|||||||
import { HumanMessage, SystemMessage } from "@langchain/core/messages";
|
import { SystemMessage } from "@langchain/core/messages";
|
||||||
import { GraphNode } from "@langchain/langgraph";
|
import { GraphNode } from "@langchain/langgraph";
|
||||||
import { MessagesState } from "../state";
|
import { MessagesState } from "../state";
|
||||||
import { ChatOpenAI } from "@langchain/openai"
|
import { ChatOllama } from "@langchain/ollama";
|
||||||
import { hydratePrompt } from "../prompts/hydratePrompt";
|
import { hydratePrompt } from "../prompts/hydratePrompt";
|
||||||
|
import { logger } from "../utils/logger";
|
||||||
|
|
||||||
export function createModelNode(tools: any, promptPath: string): GraphNode<typeof MessagesState> {
|
export function createModelNode(tools: any, promptPath: string): GraphNode<typeof MessagesState> {
|
||||||
return async (state) => {
|
return async (state) => {
|
||||||
const sysPrompt = await hydratePrompt(promptPath, state);
|
const sysPrompt = await hydratePrompt(promptPath, state);
|
||||||
|
|
||||||
const model = new ChatOpenAI({
|
const model = new ChatOllama({
|
||||||
model: "gpt-5-mini"
|
model: "llama3.1:8b-instruct-q4_K_M",
|
||||||
|
temperature: 0.3
|
||||||
});
|
});
|
||||||
|
|
||||||
const modelWithTools = model.bindTools(Object.values(tools));
|
const modelWithTools = model.bindTools(Object.values(tools));
|
||||||
|
|
||||||
const response = await modelWithTools.invoke([
|
const response = await modelWithTools.invoke([
|
||||||
new SystemMessage(
|
new SystemMessage(sysPrompt),
|
||||||
sysPrompt
|
|
||||||
),
|
|
||||||
...state.messages,
|
...state.messages,
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
logger.error(response);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
messages: [response]
|
messages: [response]
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -3,8 +3,16 @@ import { MessagesState } from "../state";
|
|||||||
import { AIMessage, BaseMessage } from "@langchain/core/messages";
|
import { AIMessage, BaseMessage } from "@langchain/core/messages";
|
||||||
import { rankExampleTriggerEvents } from "../tools/retreiveExamples";
|
import { rankExampleTriggerEvents } from "../tools/retreiveExamples";
|
||||||
|
|
||||||
|
function extractTE(text: string) {
|
||||||
|
const match = text.match(/<norm>([\s\S]*?)<\/norm>/);
|
||||||
|
if (!match) throw new Error("Nothing found between <norm> tags");
|
||||||
|
return match[1].trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
export const triggerEventSetup: GraphNode<typeof MessagesState> = async (state) => {
|
export const triggerEventSetup: GraphNode<typeof MessagesState> = async (state) => {
|
||||||
let nc = state?.messages?.at(-1)?.content ?? "" //keep a copy of normalized trigger event. Again two things, womp womp
|
let raw = state?.messages?.at(-1)?.content ?? "" //keep a copy of normalized trigger event. Again two things, womp womp
|
||||||
|
let nc = extractTE(raw.toString())
|
||||||
|
|
||||||
//Now give in-context examples. hopwfully we can self-teach?
|
//Now give in-context examples. hopwfully we can self-teach?
|
||||||
let similarityResults = await rankExampleTriggerEvents(state.disinformationTitle)
|
let similarityResults = await rankExampleTriggerEvents(state.disinformationTitle)
|
||||||
|
|||||||
@@ -1,32 +1,60 @@
|
|||||||
import { GraphNode } from "@langchain/langgraph";
|
import { GraphNode } from "@langchain/langgraph";
|
||||||
import { MessagesState, ProposedTriggerEventArray } from "../state";
|
import { MessagesState, ProposedTriggerEventArray } from "../state";
|
||||||
import { logger } from "../utils/logger";
|
import { logger } from "../utils/logger";
|
||||||
import { queryScraper } from "../tools/webSearch";
|
import { jsonrepair } from 'jsonrepair';
|
||||||
import { rankAndDisplayData } from "../tools/triggerEventTools";
|
|
||||||
|
function extractJSON(text: string) {
|
||||||
|
const match = text.match(/<json>([\s\S]*?)<\/json>/);
|
||||||
|
if (!match) throw new Error("No JSON found between <json> tags");
|
||||||
|
return match[1].trim();
|
||||||
|
}
|
||||||
|
|
||||||
export const verificationSetup: GraphNode<typeof MessagesState> = async (state) => {
|
export const verificationSetup: GraphNode<typeof MessagesState> = async (state) => {
|
||||||
//this is kinda doing two things, but having two nodes for it seems overkill
|
|
||||||
|
|
||||||
if (state.proposedTriggerEvent == undefined) {
|
if (state.proposedTriggerEvent == undefined) {
|
||||||
logger.warn("No trigger events in memory, parsing")
|
logger.warn("No trigger events in memory, parsing");
|
||||||
|
|
||||||
let genResponse = state.messages.at(-1)?.content.toString() ?? "";
|
const genResponse = state.messages.at(-1)?.content.toString() ?? "";
|
||||||
const parsed = ProposedTriggerEventArray.parse(JSON.parse(genResponse));
|
|
||||||
|
|
||||||
for (let i = 0; i < parsed.length; i++) {
|
let repaired: string;
|
||||||
const search = parsed[i].SearchQuery
|
try {
|
||||||
// const data = await queryScraper(search);
|
let extracted = extractJSON(genResponse)
|
||||||
// const output = await rankAndDisplayData(data, search);
|
repaired = jsonrepair(extracted);
|
||||||
|
} catch (repairErr: any) {
|
||||||
|
logger.error("Failed to repair JSON from LLM response.");
|
||||||
|
logger.error("Original LLM response:\n%s", genResponse);
|
||||||
|
throw new Error(`JSON repair failed: ${repairErr.message}`);
|
||||||
|
}
|
||||||
|
|
||||||
// parsed[i].context = output;
|
let parsed;
|
||||||
parsed[i].context = "NONE"
|
try {
|
||||||
|
const json = JSON.parse(repaired);
|
||||||
|
|
||||||
|
if (Array.isArray(json)) {
|
||||||
|
parsed = ProposedTriggerEventArray.parse(json);
|
||||||
|
} else {
|
||||||
|
// try grab first value
|
||||||
|
const firstValue = Object.values(json)[0];
|
||||||
|
|
||||||
|
if (Array.isArray(firstValue)) {
|
||||||
|
parsed = ProposedTriggerEventArray.parse(firstValue);
|
||||||
|
} else {
|
||||||
|
logger.error("No array found in JSON after parsing.");
|
||||||
|
logger.error("Repaired JSON:\n%s", repaired);
|
||||||
|
logger.error("Original LLM response:\n%s", genResponse);
|
||||||
|
throw new Error("No array found in JSON structure");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (parseErr: any) {
|
||||||
|
logger.error("Failed to parse LLM response to JSON or validate array.");
|
||||||
|
logger.error("Repaired JSON:\n%s", repaired);
|
||||||
|
logger.error("Original LLM response:\n%s", genResponse);
|
||||||
|
throw new Error(`Parsing failed: ${parseErr.message}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return { proposedTriggerEvent: parsed, proposedTriggerEventIndex: 0 };
|
return { proposedTriggerEvent: parsed, proposedTriggerEventIndex: 0 };
|
||||||
}
|
} else {
|
||||||
else {
|
logger.info("Trigger event index %s", state.proposedTriggerEventIndex + 1);
|
||||||
logger.info("Trigger event index %s", state.proposedTriggerEventIndex+1)
|
|
||||||
|
|
||||||
return { proposedTriggerEvent: state.proposedTriggerEvent, proposedTriggerEventIndex: state.proposedTriggerEventIndex+1 };
|
return { proposedTriggerEvent: state.proposedTriggerEvent, proposedTriggerEventIndex: state.proposedTriggerEventIndex + 1 };
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
Generated
+392
-357
File diff suppressed because it is too large
Load Diff
@@ -17,6 +17,7 @@
|
|||||||
"@langchain/core": "^1.1.17",
|
"@langchain/core": "^1.1.17",
|
||||||
"@langchain/langgraph": "^1.1.2",
|
"@langchain/langgraph": "^1.1.2",
|
||||||
"@langchain/langgraph-sdk": "^1.5.5",
|
"@langchain/langgraph-sdk": "^1.5.5",
|
||||||
|
"@langchain/ollama": "^1.2.6",
|
||||||
"@langchain/openai": "^1.2.3",
|
"@langchain/openai": "^1.2.3",
|
||||||
"axios": "^1.13.5",
|
"axios": "^1.13.5",
|
||||||
"compute-cosine-similarity": "^1.1.0",
|
"compute-cosine-similarity": "^1.1.0",
|
||||||
@@ -24,6 +25,7 @@
|
|||||||
"dotenv": "^17.2.3",
|
"dotenv": "^17.2.3",
|
||||||
"exponential-backoff": "^3.1.3",
|
"exponential-backoff": "^3.1.3",
|
||||||
"fs": "^0.0.1-security",
|
"fs": "^0.0.1-security",
|
||||||
|
"jsonrepair": "^3.13.3",
|
||||||
"langchain": "^1.2.14",
|
"langchain": "^1.2.14",
|
||||||
"selenium-webdriver": "^4.40.0",
|
"selenium-webdriver": "^4.40.0",
|
||||||
"tldts": "^7.0.23",
|
"tldts": "^7.0.23",
|
||||||
|
|||||||
@@ -16,4 +16,7 @@ Relevent examples are included in preceeding messages, use these as exact inspir
|
|||||||
The claim to normalize is:
|
The claim to normalize is:
|
||||||
###TITLE###
|
###TITLE###
|
||||||
|
|
||||||
Produce no other text other than the condensed claim.
|
Produce no other text other than the condensed claim, surrounded <norm></norm>
|
||||||
|
|
||||||
|
For example: BREAKING: the sky is green!
|
||||||
|
Becomes: <norm>The sky is green</norm>
|
||||||
@@ -1,9 +0,0 @@
|
|||||||
Could the following real-world event:
|
|
||||||
###TECLAIM###
|
|
||||||
|
|
||||||
Be a trigger for the following disinformation:
|
|
||||||
###TITLE###
|
|
||||||
|
|
||||||
Respond with "RELATION", followed by : followed by a confidence score (VERYHIGH, HIGH, MEDIUM, LOW, VERYLOW) followed by : followed by the reason. Use no other words, just return the score and reason in format.
|
|
||||||
|
|
||||||
Ignore wether the event happened or not, purely consider the likiness of causation
|
|
||||||
@@ -3,9 +3,10 @@ Once the information has been created as below, a dataset can be created to feed
|
|||||||
|
|
||||||
There is a false disinformation claim circulating:
|
There is a false disinformation claim circulating:
|
||||||
###NTITLE###
|
###NTITLE###
|
||||||
Produce up-to 5 specific events that happened that have led to the spread of this disinformation.
|
Produce up-to 5 specific "trigger events" that happened that could have led to the spread of this disinformation.
|
||||||
|
|
||||||
Remember the time frame of the disinformation campaign: ###CDATE###
|
Remember the time frame of the disinformation campaign: ###CDATE###
|
||||||
|
Include no information or events that would not have been available at the time.
|
||||||
|
|
||||||
Produce no more text other than the json.
|
Produce no more text other than the json.
|
||||||
|
|
||||||
@@ -13,8 +14,29 @@ Include a concise but specific search query that can be looked up on a search en
|
|||||||
|
|
||||||
Include a url to a source for your trigger event (not a web search, a specific url from a reputuable source). Do not use OAI cite, include url as text in response.
|
Include a url to a source for your trigger event (not a web search, a specific url from a reputuable source). Do not use OAI cite, include url as text in response.
|
||||||
|
|
||||||
Use a JSON format with each entry containing "Event,ReasoningWhyRelevant,SearchQuery,Url".
|
Include the date that the event happened ("March 2022" for exmaple)
|
||||||
|
|
||||||
|
Use a JSON format with each entry containing "Event,ReasoningWhyRelevant,SearchQuery,Url,Date".
|
||||||
|
Return ONLY JSON, no extra text. Wrap it like this:
|
||||||
|
<json>
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"Event": "Example"
|
||||||
|
...
|
||||||
|
}
|
||||||
|
]
|
||||||
|
</json>
|
||||||
|
|
||||||
Multiple tool invocations should be requested at once, if applicable.
|
Multiple tool invocations should be requested at once, if applicable.
|
||||||
|
Use your abilities to look between the lines and produce some insightful analysis, thinking both short and long term.
|
||||||
|
|
||||||
|
Events will be reordered as part of processing, each statement must stand alone
|
||||||
|
|
||||||
The preceeding messages act as examples of previous responses to potentially ficitonal events and scores given.
|
The preceeding messages act as examples of previous responses to potentially ficitonal events and scores given.
|
||||||
|
Analysis should only be completed for proposed events that would graner >0.7 points
|
||||||
|
|
||||||
|
Since URLs change frequently, use tools to retreive up to date informaiton everytime, provided examples or existing knowledge will be wrong or out of date.
|
||||||
|
|
||||||
|
Remember to return just json enclosed by <json></json>
|
||||||
|
|
||||||
|
Lets go through it step by step
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
Do the search results cited below
|
|
||||||
###TESEARCH###
|
|
||||||
Support the idea that the following happened:
|
|
||||||
###TECLAIM###
|
|
||||||
|
|
||||||
Respond with "CONFIDENCE", followed by : followed by a confidence score (VERYHIGH, HIGH, MEDIUM, LOW, VERYLOW) followed by : followed by the reason. Use no other words, just return the score and reason in format.
|
|
||||||
|
|
||||||
Dates can be off by a few days, that would still be valid
|
|
||||||
@@ -9,6 +9,7 @@ export const ProposedTriggerEvent = z.object({
|
|||||||
ReasoningWhyRelevant: z.string(),
|
ReasoningWhyRelevant: z.string(),
|
||||||
SearchQuery: z.string(),
|
SearchQuery: z.string(),
|
||||||
Url: z.url(),
|
Url: z.url(),
|
||||||
|
Date: z.string(),
|
||||||
context: z.string().optional(),
|
context: z.string().optional(),
|
||||||
score: z.number().optional()
|
score: z.number().optional()
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ export async function evaluateWithEnsemble({
|
|||||||
answer: string;
|
answer: string;
|
||||||
method: string
|
method: string
|
||||||
}): Promise<{ validProb: number; invalidProb: number; }> {
|
}): Promise<{ validProb: number; invalidProb: number; }> {
|
||||||
const res = await axios.post("http://localhost:8000/evaluate", {
|
const res = await axios.post(process.env.RANKING_URL ?? "http://localhost:8000/evaluate", {
|
||||||
answer,
|
answer,
|
||||||
method
|
method
|
||||||
}, {timeout: 0});
|
}, {timeout: 0});
|
||||||
@@ -18,11 +18,15 @@ export async function evaluateWithEnsemble({
|
|||||||
return {validProb, invalidProb};
|
return {validProb, invalidProb};
|
||||||
}
|
}
|
||||||
|
|
||||||
// let res = await evaluateWithRoberta({answer: "High-profile political downplaying of COVID-19 (examples: President Trump saying 'it will go away' in March–August 2020)"});
|
// import dotenv from "dotenv";
|
||||||
|
|
||||||
|
// dotenv.config();
|
||||||
|
|
||||||
|
// let res = await evaluateWithEnsemble({method:"flan" ,answer: "High-profile political downplaying of COVID-19 (examples: President Trump saying 'it will go away' in March–August 2020)"});
|
||||||
// console.log(res)
|
// console.log(res)
|
||||||
|
|
||||||
// res = await evaluateWithRoberta({answer: "Multiple mirrored reuploads (2020–2023) put the clip on other channels with titles implying it was a genuine 1970s public information film."});
|
// res = await evaluateWithEnsemble({method:"roberta" ,answer: "Multiple mirrored reuploads (2020–2023) put the clip on other channels with titles implying it was a genuine 1970s public information film."});
|
||||||
// console.log(res)
|
// console.log(res)
|
||||||
|
|
||||||
// res = await evaluateWithRoberta({answer: "The COVID-19 Pandemic"});
|
// res = await evaluateWithEnsemble({method:"logreg" ,answer: "The COVID-19 Pandemic"});
|
||||||
// console.log(res)
|
// console.log(res)
|
||||||
@@ -1,22 +0,0 @@
|
|||||||
import axios from "axios";
|
|
||||||
|
|
||||||
export async function evaluateWithRagas({
|
|
||||||
question,
|
|
||||||
answer,
|
|
||||||
contexts,
|
|
||||||
}: {
|
|
||||||
question: string;
|
|
||||||
answer: string;
|
|
||||||
contexts: string[];
|
|
||||||
}) {
|
|
||||||
const res = await axios.post("http://localhost:8001/evaluate", {
|
|
||||||
question,
|
|
||||||
answer,
|
|
||||||
contexts,
|
|
||||||
});
|
|
||||||
|
|
||||||
return res.data;
|
|
||||||
}
|
|
||||||
|
|
||||||
// let res = await evaluateWithRagas({question: "Who was Bill Nye", answer: "Bill Nye was a Scientist", contexts: ["Bill nye was a Scientist"]});
|
|
||||||
// console.log(res)
|
|
||||||
@@ -15,6 +15,8 @@ const CACHE_PATH = "../data/csv.cache.json";
|
|||||||
|
|
||||||
const JSONL_PATH = "../data/input.jsonl"
|
const JSONL_PATH = "../data/input.jsonl"
|
||||||
|
|
||||||
|
const BM25_MIN_DOCS = 3;
|
||||||
|
|
||||||
type EmbeddingCache = {
|
type EmbeddingCache = {
|
||||||
rawtexts: string[];
|
rawtexts: string[];
|
||||||
cleantexts: string[];
|
cleantexts: string[];
|
||||||
@@ -287,8 +289,20 @@ async function embedText(text: string): Promise<number[]> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function buildBM25(texts: string[]) {
|
function buildBM25(texts: string[]) {
|
||||||
logger.info("Building BM25 index (%s docs)...", texts.length);
|
let paddedTexts = texts;
|
||||||
|
|
||||||
|
if (texts.length < BM25_MIN_DOCS) {
|
||||||
|
const needed = BM25_MIN_DOCS - texts.length;
|
||||||
|
logger.error(
|
||||||
|
"Corpus too small for BM25 (%s docs, need %s+), padding with %s dummy doc(s)",
|
||||||
|
texts.length,
|
||||||
|
BM25_MIN_DOCS,
|
||||||
|
needed
|
||||||
|
);
|
||||||
|
paddedTexts = [...texts, ...Array(needed).fill("placeholder dummy document")];
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info("Building BM25 index (%s docs)...", paddedTexts.length);
|
||||||
const bm25 = bm25Factory();
|
const bm25 = bm25Factory();
|
||||||
|
|
||||||
bm25.defineConfig({
|
bm25.defineConfig({
|
||||||
@@ -302,7 +316,7 @@ function buildBM25(texts: string[]) {
|
|||||||
nlp.tokens.removeWords,
|
nlp.tokens.removeWords,
|
||||||
]);
|
]);
|
||||||
|
|
||||||
texts.forEach((text, i) => {
|
paddedTexts.forEach((text, i) => {
|
||||||
bm25.addDoc({ text }, i);
|
bm25.addDoc({ text }, i);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -1,32 +1,95 @@
|
|||||||
import { Builder, Browser } from "selenium-webdriver";
|
import { Builder, Browser } from "selenium-webdriver";
|
||||||
import firefox from "selenium-webdriver/firefox";
|
import firefox from "selenium-webdriver/firefox";
|
||||||
|
import { backOff } from "exponential-backoff";
|
||||||
|
import { logger } from "../utils/logger";
|
||||||
|
|
||||||
export async function extractWebpageContent(url: string) : Promise<string[]>{
|
export async function extractWebpageContent(url: string): Promise<string[]> {
|
||||||
|
try {
|
||||||
|
const response = await backOff(async () => {
|
||||||
|
return await extractWebpageContentWorker(url);
|
||||||
|
}, {
|
||||||
|
numOfAttempts: 10,
|
||||||
|
startingDelay: 500,
|
||||||
|
timeMultiple: 2,
|
||||||
|
jitter: "full",
|
||||||
|
maxDelay: 50000,
|
||||||
|
});
|
||||||
|
return response;
|
||||||
|
} catch (err: any) {
|
||||||
|
logger.error(`Failed out of retry loop for URL "${url}", returning placeholder to pipeline`);
|
||||||
|
return ["API EXCEPTION"];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function extractWebpageContentWorker(url: string): Promise<string[]> {
|
||||||
|
let driver;
|
||||||
|
try {
|
||||||
const options = new firefox.Options();
|
const options = new firefox.Options();
|
||||||
options.addArguments("--headless");
|
options.addArguments("--headless");
|
||||||
|
options.addArguments("--disable-gpu");
|
||||||
|
options.addArguments("--no-sandbox"); // Linux sandbox issues
|
||||||
|
options.addArguments("--disable-dev-shm-usage"); // /dev/shm issues
|
||||||
|
driver = await new Builder()
|
||||||
|
.forBrowser(Browser.FIREFOX)
|
||||||
|
.setFirefoxOptions(options)
|
||||||
|
.build();
|
||||||
|
} catch (err: any) {
|
||||||
|
const desc = `Failed to launch Firefox driver: ${err.message}`;
|
||||||
|
logger.error(desc);
|
||||||
|
throw new Error(desc);
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
try {
|
||||||
|
await driver.get(url);
|
||||||
|
} catch (err: any) {
|
||||||
|
const desc = `Failed to navigate to URL "${url}": ${err.message}`;
|
||||||
|
logger.error(desc);
|
||||||
|
throw new Error(desc);
|
||||||
|
}
|
||||||
|
|
||||||
let driver = await new Builder().forBrowser(Browser.FIREFOX).setFirefoxOptions(options).build()
|
|
||||||
try {
|
try {
|
||||||
await driver.get(url)
|
|
||||||
await driver.wait(async () => {
|
await driver.wait(async () => {
|
||||||
return await driver.executeScript(
|
return await driver.executeScript(
|
||||||
"return document.readyState === 'complete'"
|
"return document.readyState === 'complete'"
|
||||||
);
|
);
|
||||||
}, 5000);
|
}, 5000);
|
||||||
|
} catch (err: any) {
|
||||||
|
logger.error(`Page load timed out for "${url}", attempting to read partial content: ${err.message}`);
|
||||||
|
// do not throw, attempt to read
|
||||||
|
}
|
||||||
|
|
||||||
const readableText = await driver.executeScript(
|
let readableText: string;
|
||||||
|
try {
|
||||||
|
readableText = await driver.executeScript(
|
||||||
"return document.body.innerText;"
|
"return document.body.innerText;"
|
||||||
) as string;
|
) as string;
|
||||||
|
} catch (err: any) {
|
||||||
|
const desc = `Failed to extract page text from "${url}": ${err.message}`;
|
||||||
|
logger.error(desc);
|
||||||
|
throw new Error(desc);
|
||||||
|
}
|
||||||
|
|
||||||
const filteredLines = readableText
|
const filteredLines = readableText
|
||||||
.split(/\r?\n/)
|
.split(/\r?\n/)
|
||||||
.map(line => line.trim())
|
.map(line => line.trim())
|
||||||
.filter(line => line.split(/\s+/).length > 1);
|
.filter(line => line.split(/\s+/).length > 1);
|
||||||
|
|
||||||
|
if (filteredLines.length === 0) {
|
||||||
|
const desc = `No content extracted from "${url}"`;
|
||||||
|
logger.error(desc);
|
||||||
|
throw new Error(desc);
|
||||||
|
}
|
||||||
|
|
||||||
return filteredLines;
|
return filteredLines;
|
||||||
} finally {
|
} finally {
|
||||||
await driver.quit()
|
try {
|
||||||
|
await driver.quit();
|
||||||
|
} catch (err: any) {
|
||||||
|
logger.error(`Failed to quit Firefox driver cleanly: ${err.message}`);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//console.log(await extractWebpageContent("https://www.bbc.co.uk/news/live/c74wd01egvyt"))
|
// console.log(await extractWebpageContent("https://www.bbc.co.uk/news/live/c74wd01egvyt"))
|
||||||
|
// console.log(await extractWebpageContent("https://badcertificate.int.jeynes.uk/"))
|
||||||
@@ -92,7 +92,7 @@ LABEL_TO_INT = {v: k for k, v in INT_TO_LABEL.items()}
|
|||||||
flan_tokenizer = AutoTokenizer.from_pretrained(FLAN_PATH)
|
flan_tokenizer = AutoTokenizer.from_pretrained(FLAN_PATH)
|
||||||
flan_model = AutoModelForSeq2SeqLM.from_pretrained(FLAN_PATH)
|
flan_model = AutoModelForSeq2SeqLM.from_pretrained(FLAN_PATH)
|
||||||
|
|
||||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
device = torch.device("cpu")
|
||||||
flan_model.to(device)
|
flan_model.to(device)
|
||||||
flan_model.eval()
|
flan_model.eval()
|
||||||
|
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ datasets
|
|||||||
# ROBERTA
|
# ROBERTA
|
||||||
scikit-learn
|
scikit-learn
|
||||||
transformers[torch]
|
transformers[torch]
|
||||||
|
sentence_transformers
|
||||||
|
|
||||||
# Utils
|
# Utils
|
||||||
numpy
|
numpy
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ const AGENT_NAME = process.env.AGENT ?? "agent";
|
|||||||
*/
|
*/
|
||||||
const MODE = process.env.MODE ?? "claim";
|
const MODE = process.env.MODE ?? "claim";
|
||||||
|
|
||||||
const MAX_CONCURRENCY = 5;
|
const MAX_CONCURRENCY = 1;
|
||||||
|
|
||||||
const client = new Client({ apiUrl: API_URL });
|
const client = new Client({ apiUrl: API_URL });
|
||||||
|
|
||||||
@@ -118,7 +118,7 @@ async function processRecord(record: any): Promise<ResultRecord> {
|
|||||||
input: buildAgentInput(record),
|
input: buildAgentInput(record),
|
||||||
streamMode: "values",
|
streamMode: "values",
|
||||||
config: {
|
config: {
|
||||||
recursion_limit: 50
|
recursion_limit: 100
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user