Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| fd0674e96a | |||
| 5e374a8bd6 | |||
| fbc688b8f9 |
@@ -3,9 +3,10 @@ Once the information has been created as below, a dataset can be created to feed
|
|||||||
|
|
||||||
There is a false disinformation claim circulating:
|
There is a false disinformation claim circulating:
|
||||||
###NTITLE###
|
###NTITLE###
|
||||||
Produce up-to 5 specific events that happened that have led to the spread of this disinformation.
|
Produce up to 5 specific "trigger events" that happened that could have led to the spread of this disinformation.
|
||||||
|
|
||||||
Remember the time frame of the disinformation campaign: ###CDATE###
|
Remember the time frame of the disinformation campaign: ###CDATE###
|
||||||
|
Include no information or events that would not have been available at the time.
|
||||||
|
|
||||||
Produce no more text other than the json.
|
Produce no more text other than the json.
|
||||||
|
|
||||||
@@ -13,8 +14,20 @@ Include a concise but specific search query that can be looked up on a search en
|
|||||||
|
|
||||||
Include a url to a source for your trigger event (not a web search, a specific url from a reputable source). Do not use OAI cite, include url as text in response.
|
Include a url to a source for your trigger event (not a web search, a specific url from a reputable source). Do not use OAI cite, include url as text in response.
|
||||||
|
|
||||||
Use a JSON format with each entry containing "Event,ReasoningWhyRelevant,SearchQuery,Url".
|
Include the date on which the event happened (for example, "March 2022").
|
||||||
|
|
||||||
|
Use a JSON format with each entry containing "Event,ReasoningWhyRelevant,SearchQuery,Url,Date".
|
||||||
|
|
||||||
Multiple tool invocations should be requested at once, if applicable.
|
Multiple tool invocations should be requested at once, if applicable.
|
||||||
|
Use your abilities to look between the lines and produce some insightful analysis, thinking both short and long term.
|
||||||
|
|
||||||
|
Events will be reordered as part of processing, so each statement must stand alone.
|
||||||
|
|
||||||
The preceding messages act as examples of previous responses to potentially fictional events and scores given.
|
The preceding messages act as examples of previous responses to potentially fictional events and scores given.
|
||||||
|
Analysis should only be completed for proposed events that would garner >0.7 points.
|
||||||
|
|
||||||
|
First, consider a range of directions from which the proposed disinformation could have been influenced.
|
||||||
|
Then, research these directions in turn, using the tools at hand.
|
||||||
|
Finally, refine your proposed "trigger event" until it is specific, quantifiable and backed up by evidence.
|
||||||
|
|
||||||
|
Let's go through it step by step.
|
||||||
@@ -9,6 +9,7 @@ export const ProposedTriggerEvent = z.object({
|
|||||||
ReasoningWhyRelevant: z.string(),
|
ReasoningWhyRelevant: z.string(),
|
||||||
SearchQuery: z.string(),
|
SearchQuery: z.string(),
|
||||||
Url: z.url(),
|
Url: z.url(),
|
||||||
|
Date: z.string(),
|
||||||
context: z.string().optional(),
|
context: z.string().optional(),
|
||||||
score: z.number().optional()
|
score: z.number().optional()
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -15,6 +15,8 @@ const CACHE_PATH = "../data/csv.cache.json";
|
|||||||
|
|
||||||
const JSONL_PATH = "../data/input.jsonl"
|
const JSONL_PATH = "../data/input.jsonl"
|
||||||
|
|
||||||
|
const BM25_MIN_DOCS = 3;
|
||||||
|
|
||||||
type EmbeddingCache = {
|
type EmbeddingCache = {
|
||||||
rawtexts: string[];
|
rawtexts: string[];
|
||||||
cleantexts: string[];
|
cleantexts: string[];
|
||||||
@@ -287,8 +289,20 @@ async function embedText(text: string): Promise<number[]> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function buildBM25(texts: string[]) {
|
function buildBM25(texts: string[]) {
|
||||||
logger.info("Building BM25 index (%s docs)...", texts.length);
|
let paddedTexts = texts;
|
||||||
|
|
||||||
|
if (texts.length < BM25_MIN_DOCS) {
|
||||||
|
const needed = BM25_MIN_DOCS - texts.length;
|
||||||
|
logger.error(
|
||||||
|
"Corpus too small for BM25 (%s docs, need %s+), padding with %s dummy doc(s)",
|
||||||
|
texts.length,
|
||||||
|
BM25_MIN_DOCS,
|
||||||
|
needed
|
||||||
|
);
|
||||||
|
paddedTexts = [...texts, ...Array(needed).fill("placeholder dummy document")];
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info("Building BM25 index (%s docs)...", paddedTexts.length);
|
||||||
const bm25 = bm25Factory();
|
const bm25 = bm25Factory();
|
||||||
|
|
||||||
bm25.defineConfig({
|
bm25.defineConfig({
|
||||||
@@ -302,7 +316,7 @@ function buildBM25(texts: string[]) {
|
|||||||
nlp.tokens.removeWords,
|
nlp.tokens.removeWords,
|
||||||
]);
|
]);
|
||||||
|
|
||||||
texts.forEach((text, i) => {
|
paddedTexts.forEach((text, i) => {
|
||||||
bm25.addDoc({ text }, i);
|
bm25.addDoc({ text }, i);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -1,32 +1,92 @@
|
|||||||
import { Builder, Browser } from "selenium-webdriver";
|
import { Builder, Browser } from "selenium-webdriver";
|
||||||
import firefox from "selenium-webdriver/firefox";
|
import firefox from "selenium-webdriver/firefox";
|
||||||
|
import { backOff } from "exponential-backoff";
|
||||||
|
import { logger } from "../utils/logger";
|
||||||
|
|
||||||
export async function extractWebpageContent(url: string): Promise<string[]> {
|
export async function extractWebpageContent(url: string): Promise<string[]> {
|
||||||
|
try {
|
||||||
|
const response = await backOff(async () => {
|
||||||
|
return await extractWebpageContentWorker(url);
|
||||||
|
}, {
|
||||||
|
numOfAttempts: 10,
|
||||||
|
startingDelay: 500,
|
||||||
|
timeMultiple: 2,
|
||||||
|
jitter: "full",
|
||||||
|
maxDelay: 50000,
|
||||||
|
});
|
||||||
|
return response;
|
||||||
|
} catch (err: any) {
|
||||||
|
logger.error(`Failed out of retry loop for URL "${url}", returning placeholder to pipeline`);
|
||||||
|
return ["API EXCEPTION"];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function extractWebpageContentWorker(url: string): Promise<string[]> {
|
||||||
|
let driver;
|
||||||
|
try {
|
||||||
const options = new firefox.Options();
|
const options = new firefox.Options();
|
||||||
options.addArguments("--headless");
|
options.addArguments("--headless");
|
||||||
|
driver = await new Builder()
|
||||||
|
.forBrowser(Browser.FIREFOX)
|
||||||
|
.setFirefoxOptions(options)
|
||||||
|
.build();
|
||||||
|
} catch (err: any) {
|
||||||
|
const desc = `Failed to launch Firefox driver: ${err.message}`;
|
||||||
|
logger.error(desc);
|
||||||
|
throw new Error(desc);
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
try {
|
||||||
|
await driver.get(url);
|
||||||
|
} catch (err: any) {
|
||||||
|
const desc = `Failed to navigate to URL "${url}": ${err.message}`;
|
||||||
|
logger.error(desc);
|
||||||
|
throw new Error(desc);
|
||||||
|
}
|
||||||
|
|
||||||
let driver = await new Builder().forBrowser(Browser.FIREFOX).setFirefoxOptions(options).build()
|
|
||||||
try {
|
try {
|
||||||
await driver.get(url)
|
|
||||||
await driver.wait(async () => {
|
await driver.wait(async () => {
|
||||||
return await driver.executeScript(
|
return await driver.executeScript(
|
||||||
"return document.readyState === 'complete'"
|
"return document.readyState === 'complete'"
|
||||||
);
|
);
|
||||||
}, 5000);
|
}, 5000);
|
||||||
|
} catch (err: any) {
|
||||||
|
logger.error(`Page load timed out for "${url}", attempting to read partial content: ${err.message}`);
|
||||||
|
// do not throw, attempt to read
|
||||||
|
}
|
||||||
|
|
||||||
const readableText = await driver.executeScript(
|
let readableText: string;
|
||||||
|
try {
|
||||||
|
readableText = await driver.executeScript(
|
||||||
"return document.body.innerText;"
|
"return document.body.innerText;"
|
||||||
) as string;
|
) as string;
|
||||||
|
} catch (err: any) {
|
||||||
|
const desc = `Failed to extract page text from "${url}": ${err.message}`;
|
||||||
|
logger.error(desc);
|
||||||
|
throw new Error(desc);
|
||||||
|
}
|
||||||
|
|
||||||
const filteredLines = readableText
|
const filteredLines = readableText
|
||||||
.split(/\r?\n/)
|
.split(/\r?\n/)
|
||||||
.map(line => line.trim())
|
.map(line => line.trim())
|
||||||
.filter(line => line.split(/\s+/).length > 1);
|
.filter(line => line.split(/\s+/).length > 1);
|
||||||
|
|
||||||
|
if (filteredLines.length === 0) {
|
||||||
|
const desc = `No content extracted from "${url}"`;
|
||||||
|
logger.error(desc);
|
||||||
|
throw new Error(desc);
|
||||||
|
}
|
||||||
|
|
||||||
return filteredLines;
|
return filteredLines;
|
||||||
} finally {
|
} finally {
|
||||||
await driver.quit()
|
try {
|
||||||
|
await driver.quit();
|
||||||
|
} catch (err: any) {
|
||||||
|
logger.error(`Failed to quit Firefox driver cleanly: ${err.message}`);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// console.log(await extractWebpageContent("https://www.bbc.co.uk/news/live/c74wd01egvyt"))
|
// console.log(await extractWebpageContent("https://www.bbc.co.uk/news/live/c74wd01egvyt"))
|
||||||
|
// console.log(await extractWebpageContent("https://badcertificate.int.jeynes.uk/"))
|
||||||
@@ -118,7 +118,7 @@ async function processRecord(record: any): Promise<ResultRecord> {
|
|||||||
input: buildAgentInput(record),
|
input: buildAgentInput(record),
|
||||||
streamMode: "values",
|
streamMode: "values",
|
||||||
config: {
|
config: {
|
||||||
recursion_limit: 50
|
recursion_limit: 100
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user