Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 4e0bab9897 | |||
| c4dac3f515 | |||
| 2252a42466 | |||
| 75ca1032a6 | |||
| 00d129bd28 | |||
| cf923d6e87 | |||
| f821e9643d | |||
| 43ecd04135 | |||
| 8c0921057b | |||
| b610e8c989 | |||
| f8d4155b7c | |||
| 5e374a8bd6 |
@@ -1,9 +1,22 @@
|
||||
# AI models for identifying trigger events in disinformation analysis
|
||||
Final Dissertation Submission Repository
|
||||
|
||||
## Project Description
|
||||
## Abstract
|
||||
-- todo --
|
||||
|
||||
[Project Presentation](https://jillweynes.github.io/LLMsForDisinformationPrediction-GraphVizBuilt/presentation)
|
||||
|
||||
## Generated Dataset Link and Usage Experiments
|
||||
Generated Dataset Link: [https://huggingface.co/datasets/WillJeynes/LLMsForDisinformationAnalysis-Dataset](https://huggingface.co/datasets/WillJeynes/LLMsForDisinformationAnalysis-Dataset)
|
||||
|
||||
Graph-Based Dataset Visualisation: [https://jillweynes.github.io/LLMsForDisinformationPrediction-GraphVizBuilt/](https://jillweynes.github.io/LLMsForDisinformationPrediction-GraphVizBuilt/)
|
||||
|
||||
Usage Experiments (incl graph visualisation) Source Code: [https://github.com/WillJeynes/LLMsForDisinformationPrediction](https://github.com/WillJeynes/LLMsForDisinformationPrediction)
|
||||
|
||||
|
||||
|
||||
# This repository:
|
||||
|
||||
## Solution Diagram
|
||||
-- todo --
|
||||
|
||||
@@ -13,8 +26,6 @@ Final Dissertation Submission Repository
|
||||
## Agent Refinement
|
||||
[See agent](/agent/)
|
||||
|
||||
## Generated Database Link and Usage Experiments
|
||||
-- todo --
|
||||
|
||||
## Repository Structure
|
||||
```
|
||||
|
||||
+30
-1
@@ -1,3 +1,32 @@
|
||||
## Refining the agent output
|
||||
|
||||
TODO: Table and document experiments
|
||||
Experiments modifying pipeline
|
||||
|
||||
| Pipeline Variant | % Correct | Relative Change |
|
||||
|------------------|----------:|---------:|
|
||||
| BASELINE | 33 | 0 |
|
||||
| Improv Prompt | 39.96 | 0.21 |
|
||||
| Add Examples | 44.67 | 0.35 |
|
||||
| Date | 45.51 | 0.38 |
|
||||
| Chain of Thought | 43.38 | 0.31 |
|
||||
| Self-Critique | 44.36 | 0.34 |
|
||||
|
||||
Experiments with different model types:
|
||||
| Model | % Correct | % Change |
|
||||
|-------------------------------|----------:|---------:|
|
||||
| gpt-5-mini | 45.51 | |
|
||||
| gpt-5.4-mini | 32.4 | |
|
||||
| gpt-5.4-nano | 23.28 | |
|
||||
| gpt-4.1-mini | 27.85 | |
|
||||
| gpt-4o-mini | 32.47 | |
|
||||
| llama3.1:8b-instruct-q4_K_M | ? | |
|
||||
| qwen3.5:9b | 0 | |
|
||||
|
||||
Percentage of valid URLs
|
||||
| Model | Valid / Total | % Valid |
|
||||
|-------------------------------|----------:|---------:|
|
||||
| gpt-5-mini | 22/405 | 5.43 |
|
||||
| gpt-5.4-mini | 29/278 | 10.43 |
|
||||
| gpt-5.4-nano | 6/210 | 2.85 |
|
||||
| gpt-4.1-mini | 15/269 | 5.57 |
|
||||
| gpt-4o-mini | 27/287 | 9.41 |
|
||||
@@ -1,8 +1,7 @@
|
||||
import { GraphNode } from "@langchain/langgraph";
|
||||
import { MessagesState, ProposedTriggerEventArray } from "../state";
|
||||
import { logger } from "../utils/logger";
|
||||
import { queryScraper } from "../tools/webSearch";
|
||||
import { rankAndDisplayData } from "../tools/triggerEventTools";
|
||||
import { jsonrepair } from 'jsonrepair'
|
||||
|
||||
export const verificationSetup: GraphNode<typeof MessagesState> = async (state) => {
|
||||
//this is kinda doing two things, but having two nodes for it seems overkill
|
||||
@@ -11,15 +10,29 @@ export const verificationSetup: GraphNode<typeof MessagesState> = async (state)
|
||||
logger.warn("No trigger events in memory, parsing")
|
||||
|
||||
let genResponse = state.messages.at(-1)?.content.toString() ?? "";
|
||||
const parsed = ProposedTriggerEventArray.parse(JSON.parse(genResponse));
|
||||
|
||||
for (let i = 0; i < parsed.length; i++) {
|
||||
const search = parsed[i].SearchQuery
|
||||
// const data = await queryScraper(search);
|
||||
// const output = await rankAndDisplayData(data, search);
|
||||
const repaired = jsonrepair(genResponse);
|
||||
|
||||
// parsed[i].context = output;
|
||||
parsed[i].context = "NONE"
|
||||
let parsed;
|
||||
|
||||
try {
|
||||
const json = JSON.parse(repaired);
|
||||
|
||||
if (Array.isArray(json)) {
|
||||
parsed = ProposedTriggerEventArray.parse(json);
|
||||
} else {
|
||||
// try grab first value
|
||||
const firstValue = Object.values(json)[0];
|
||||
|
||||
if (Array.isArray(firstValue)) {
|
||||
parsed = ProposedTriggerEventArray.parse(firstValue);
|
||||
} else {
|
||||
throw new Error("No array found in JSON");
|
||||
}
|
||||
}
|
||||
} catch (err: any) {
|
||||
logger.error(`Failed to parse LLM response: ${err.message}`);
|
||||
throw new Error(`Failed to parse LLM response: ${err}`);
|
||||
}
|
||||
|
||||
return { proposedTriggerEvent: parsed, proposedTriggerEventIndex: 0 };
|
||||
|
||||
Generated
+10
@@ -20,6 +20,7 @@
|
||||
"dotenv": "^17.2.3",
|
||||
"exponential-backoff": "^3.1.3",
|
||||
"fs": "^0.0.1-security",
|
||||
"jsonrepair": "^3.13.3",
|
||||
"langchain": "^1.2.14",
|
||||
"selenium-webdriver": "^4.40.0",
|
||||
"tldts": "^7.0.23",
|
||||
@@ -2075,6 +2076,15 @@
|
||||
"integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==",
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/jsonrepair": {
|
||||
"version": "3.13.3",
|
||||
"resolved": "https://registry.npmjs.org/jsonrepair/-/jsonrepair-3.13.3.tgz",
|
||||
"integrity": "sha512-BTznj0owIt2CBAH/LTo7+1I5pMvl1e1033LRl/HUowlZmJOIhzC0zbX5bxMngLkfT4WnzPP26QnW5wMr2g9tsQ==",
|
||||
"license": "ISC",
|
||||
"bin": {
|
||||
"jsonrepair": "bin/cli.js"
|
||||
}
|
||||
},
|
||||
"node_modules/jszip": {
|
||||
"version": "3.10.1",
|
||||
"resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz",
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
"dotenv": "^17.2.3",
|
||||
"exponential-backoff": "^3.1.3",
|
||||
"fs": "^0.0.1-security",
|
||||
"jsonrepair": "^3.13.3",
|
||||
"langchain": "^1.2.14",
|
||||
"selenium-webdriver": "^4.40.0",
|
||||
"tldts": "^7.0.23",
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
Could the following real-world event:
|
||||
###TECLAIM###
|
||||
|
||||
Be a trigger for the following disinformation:
|
||||
###TITLE###
|
||||
|
||||
Respond with "RELATION", followed by : followed by a confidence score (VERYHIGH, HIGH, MEDIUM, LOW, VERYLOW) followed by : followed by the reason. Use no other words, just return the score and reason in format.
|
||||
|
||||
Ignore whether the event happened or not; purely consider the likelihood of causation
|
||||
@@ -1,8 +0,0 @@
|
||||
Do the search results cited below
|
||||
###TESEARCH###
|
||||
Support the idea that the following happened:
|
||||
###TECLAIM###
|
||||
|
||||
Respond with "CONFIDENCE", followed by : followed by a confidence score (VERYHIGH, HIGH, MEDIUM, LOW, VERYLOW) followed by : followed by the reason. Use no other words, just return the score and reason in format.
|
||||
|
||||
Dates can be off by a few days, that would still be valid
|
||||
@@ -15,6 +15,8 @@ const CACHE_PATH = "../data/csv.cache.json";
|
||||
|
||||
const JSONL_PATH = "../data/input.jsonl"
|
||||
|
||||
const BM25_MIN_DOCS = 3;
|
||||
|
||||
type EmbeddingCache = {
|
||||
rawtexts: string[];
|
||||
cleantexts: string[];
|
||||
@@ -287,8 +289,20 @@ async function embedText(text: string): Promise<number[]> {
|
||||
}
|
||||
|
||||
function buildBM25(texts: string[]) {
|
||||
logger.info("Building BM25 index (%s docs)...", texts.length);
|
||||
let paddedTexts = texts;
|
||||
|
||||
if (texts.length < BM25_MIN_DOCS) {
|
||||
const needed = BM25_MIN_DOCS - texts.length;
|
||||
logger.error(
|
||||
"Corpus too small for BM25 (%s docs, need %s+), padding with %s dummy doc(s)",
|
||||
texts.length,
|
||||
BM25_MIN_DOCS,
|
||||
needed
|
||||
);
|
||||
paddedTexts = [...texts, ...Array(needed).fill("placeholder dummy document")];
|
||||
}
|
||||
|
||||
logger.info("Building BM25 index (%s docs)...", paddedTexts.length);
|
||||
const bm25 = bm25Factory();
|
||||
|
||||
bm25.defineConfig({
|
||||
@@ -302,7 +316,7 @@ function buildBM25(texts: string[]) {
|
||||
nlp.tokens.removeWords,
|
||||
]);
|
||||
|
||||
texts.forEach((text, i) => {
|
||||
paddedTexts.forEach((text, i) => {
|
||||
bm25.addDoc({ text }, i);
|
||||
});
|
||||
|
||||
|
||||
+87
-27
@@ -1,32 +1,92 @@
|
||||
import { Builder, Browser } from "selenium-webdriver";
|
||||
import firefox from "selenium-webdriver/firefox";
|
||||
import { backOff } from "exponential-backoff";
|
||||
import { logger } from "../utils/logger";
|
||||
|
||||
export async function extractWebpageContent(url: string) : Promise<string[]>{
|
||||
const options = new firefox.Options();
|
||||
options.addArguments("--headless");
|
||||
|
||||
let driver = await new Builder().forBrowser(Browser.FIREFOX).setFirefoxOptions(options).build()
|
||||
try {
|
||||
await driver.get(url)
|
||||
await driver.wait(async () => {
|
||||
return await driver.executeScript(
|
||||
"return document.readyState === 'complete'"
|
||||
);
|
||||
}, 5000);
|
||||
|
||||
const readableText = await driver.executeScript(
|
||||
"return document.body.innerText;"
|
||||
) as string;
|
||||
|
||||
const filteredLines = readableText
|
||||
.split(/\r?\n/)
|
||||
.map(line => line.trim())
|
||||
.filter(line => line.split(/\s+/).length > 1);
|
||||
|
||||
return filteredLines;
|
||||
} finally {
|
||||
await driver.quit()
|
||||
}
|
||||
/**
 * Extracts the readable text lines of a web page, retrying on failure.
 *
 * The browser work is delegated to `extractWebpageContentWorker`, wrapped in
 * `backOff` configured with `numOfAttempts: 10`, `startingDelay: 500`,
 * `timeMultiple: 2`, `jitter: "full"` and `maxDelay: 50000`.
 *
 * Any error escaping the retry loop is caught here: it is logged together
 * with its cause, and a one-element placeholder is returned so the calling
 * pipeline can continue instead of failing.
 *
 * @param url - Address of the page to scrape.
 * @returns The extracted lines, or `["API EXCEPTION"]` when every attempt failed.
 */
export async function extractWebpageContent(url: string): Promise<string[]> {
  try {
    return await backOff(() => extractWebpageContentWorker(url), {
      numOfAttempts: 10,
      startingDelay: 500,
      timeMultiple: 2,
      jitter: "full",
      maxDelay: 50000,
    });
  } catch (err: unknown) {
    // Thrown values are not guaranteed to be Error instances, so narrow first.
    const reason = err instanceof Error ? err.message : String(err);
    // Include the final failure reason; without it the log cannot explain why
    // the placeholder was returned.
    logger.error(
      `Failed out of retry loop for URL "${url}", returning placeholder to pipeline: ${reason}`
    );
    return ["API EXCEPTION"];
  }
}
|
||||
|
||||
//console.log(await extractWebpageContent("https://www.bbc.co.uk/news/live/c74wd01egvyt"))
|
||||
/** Converts an arbitrary thrown value into a printable message. */
function scrapeErrorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Performs a single scrape attempt: launches headless Firefox, loads `url`,
 * reads `document.body.innerText` and returns its lines that contain more
 * than one whitespace-separated word.
 *
 * Every failure is logged and rethrown as a new `Error` with a descriptive
 * message. A page-load wait timeout is the one exception: it is logged and
 * the partially loaded page is still read.
 *
 * @param url - Address of the page to scrape.
 * @returns The trimmed multi-word lines of the page text (never empty).
 * @throws Error if the driver cannot be launched, navigation fails, the page
 *   text cannot be read, or no multi-word line is found.
 */
async function extractWebpageContentWorker(url: string): Promise<string[]> {
  let driver;
  try {
    const options = new firefox.Options();
    options.addArguments("--headless");
    driver = await new Builder()
      .forBrowser(Browser.FIREFOX)
      .setFirefoxOptions(options)
      .build();
  } catch (err: unknown) {
    const desc = `Failed to launch Firefox driver: ${scrapeErrorMessage(err)}`;
    logger.error(desc);
    throw new Error(desc);
  }

  try {
    try {
      await driver.get(url);
    } catch (err: unknown) {
      const desc = `Failed to navigate to URL "${url}": ${scrapeErrorMessage(err)}`;
      logger.error(desc);
      throw new Error(desc);
    }

    try {
      await driver.wait(async () => {
        return await driver.executeScript(
          "return document.readyState === 'complete'"
        );
      }, 5000);
    } catch (err: unknown) {
      logger.error(
        `Page load timed out for "${url}", attempting to read partial content: ${scrapeErrorMessage(err)}`
      );
      // do not throw, attempt to read
    }

    let pageText: unknown;
    try {
      pageText = await driver.executeScript("return document.body.innerText;");
    } catch (err: unknown) {
      const desc = `Failed to extract page text from "${url}": ${scrapeErrorMessage(err)}`;
      logger.error(desc);
      throw new Error(desc);
    }

    // The script result is untyped; validate it instead of asserting `string`
    // so a non-text result fails with a clear, logged error rather than an
    // opaque TypeError from `.split`.
    if (typeof pageText !== "string") {
      const desc = `Failed to extract page text from "${url}": script returned ${typeof pageText} instead of text`;
      logger.error(desc);
      throw new Error(desc);
    }

    // Keep only lines that contain at least two whitespace-separated tokens.
    const filteredLines = pageText
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter((line) => line.split(/\s+/).length > 1);

    if (filteredLines.length === 0) {
      const desc = `No content extracted from "${url}"`;
      logger.error(desc);
      throw new Error(desc);
    }

    return filteredLines;
  } finally {
    // Always shut the browser down; a failing quit must not mask the result
    // or the original error.
    try {
      await driver.quit();
    } catch (err: unknown) {
      logger.error(`Failed to quit Firefox driver cleanly: ${scrapeErrorMessage(err)}`);
    }
  }
}
|
||||
|
||||
// console.log(await extractWebpageContent("https://www.bbc.co.uk/news/live/c74wd01egvyt"))
|
||||
// console.log(await extractWebpageContent("https://badcertificate.int.jeynes.uk/"))
|
||||
@@ -5,7 +5,7 @@ set -e
|
||||
run_agent () {
|
||||
echo "Starting LangGraph agent..."
|
||||
cd agent
|
||||
npx @langchain/langgraph-cli dev
|
||||
npx @langchain/langgraph-cli@1.1.17 dev
|
||||
}
|
||||
|
||||
run_ensemble_service () {
|
||||
|
||||
@@ -9,6 +9,7 @@ datasets
|
||||
# ROBERTA
|
||||
scikit-learn
|
||||
transformers[torch]
|
||||
sentence_transformers
|
||||
|
||||
# Utils
|
||||
numpy
|
||||
|
||||
@@ -19,6 +19,9 @@ const MODE = process.env.MODE ?? "claim";
|
||||
|
||||
const MAX_CONCURRENCY = 5;
|
||||
|
||||
const OFFSET = parseInt(process.env.OFFSET ?? "0", 10);
|
||||
const LIMIT = process.env.LIMIT ? parseInt(process.env.LIMIT, 10) : null;
|
||||
|
||||
const client = new Client({ apiUrl: API_URL });
|
||||
|
||||
|
||||
@@ -118,7 +121,7 @@ async function processRecord(record: any): Promise<ResultRecord> {
|
||||
input: buildAgentInput(record),
|
||||
streamMode: "values",
|
||||
config: {
|
||||
recursion_limit: 50
|
||||
recursion_limit: 100
|
||||
}
|
||||
});
|
||||
|
||||
@@ -164,10 +167,19 @@ async function processRecord(record: any): Promise<ResultRecord> {
|
||||
async function main() {
|
||||
console.log("Reading input file...");
|
||||
|
||||
const records = await loadInputs();
|
||||
const allRecords = await loadInputs();
|
||||
|
||||
console.log(`Loaded ${records.length} records`);
|
||||
console.log(`Loaded ${allRecords.length} records`);
|
||||
|
||||
const records = allRecords.slice(
|
||||
OFFSET,
|
||||
LIMIT !== null ? OFFSET + LIMIT : undefined
|
||||
);
|
||||
|
||||
console.log(
|
||||
`Processing ${records.length} records (offset=${OFFSET}, limit=${LIMIT ?? "∞"})`
|
||||
);
|
||||
|
||||
fs.writeFileSync(OUTPUT_FILE, "", { flag: "a" });
|
||||
|
||||
const limit = pLimit(MAX_CONCURRENCY);
|
||||
|
||||
@@ -0,0 +1,119 @@
|
||||
import json
|
||||
import argparse
|
||||
from urllib.parse import urlparse
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.common.exceptions import WebDriverException, TimeoutException, StaleElementReferenceException
|
||||
from tqdm import tqdm
|
||||
|
||||
def init_driver():
    """Create a headless Chrome WebDriver configured for URL checks.

    The browser is started with the command-line switches listed below and
    with content-setting preferences for images, stylesheets and cookies set
    to 2. The page-load timeout is set to 30 seconds.

    Returns:
        The configured ``webdriver.Chrome`` instance. The caller is
        responsible for calling ``quit()`` on it.
    """
    options = Options()
    # Headless mode is requested through the "--headless" switch only. The
    # ``Options.headless`` property is deprecated (and removed in newer
    # Selenium 4 releases), and setting it was redundant with the switch.
    for switch in (
        "--disable-gpu",
        "--no-sandbox",
        "--headless",
        "--disable-blink-features=AutomationControlled",
        "--window-size=1920,1080",
    ):
        options.add_argument(switch)

    prefs = {
        "profile.managed_default_content_settings.images": 2,  # block images
        "profile.default_content_setting_values.stylesheets": 2,  # block CSS
        "profile.managed_default_content_settings.cookies": 2,  # optional
    }
    options.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome(options=options)
    driver.set_page_load_timeout(30)
    return driver
|
||||
|
||||
def is_root_url(url):
    """Return True when the path component of ``url`` is empty or just "/"."""
    path = urlparse(url).path
    return path == "" or path == "/"
|
||||
|
||||
def is_404_page(driver):
    """Report whether the loaded page looks like a 404 page.

    The page counts as a 404 when the text "404" occurs in the lower-cased
    page title or in the lower-cased text of the ``body`` element. Any
    exception raised while reading either (including stale element
    references) results in ``False``.
    """
    try:
        # Read both pieces before testing, so a failing body lookup yields
        # False even when the title alone would have matched.
        page_title = driver.title.lower()
        page_body = driver.find_element("tag name", "body").text.lower()
    except Exception:
        return False
    return any("404" in text for text in (page_title, page_body))
|
||||
|
||||
def check_url_selenium(url):
    """Load ``url`` in a fresh browser and decide whether it is usable.

    Returns:
        A ``(is_valid, reason)`` tuple. ``reason`` is ``None`` for a valid
        URL. Otherwise it is "404 page detected", a message saying the
        browser ended up on a root URL, or the text of the WebDriver error.
    """
    browser = None
    try:
        browser = init_driver()
        browser.get(url)

        if is_404_page(browser):
            verdict = (False, "404 page detected")
        else:
            # Inspect the address the browser ended on, i.e. after redirects.
            landed_on = browser.current_url
            if is_root_url(landed_on):
                verdict = (False, f"Redirected to root URL ({landed_on})")
            else:
                verdict = (True, None)
        return verdict
    except (WebDriverException, TimeoutException) as exc:
        return False, str(exc)
    finally:
        # A driver is created per call, so always release it.
        if browser:
            browser.quit()
|
||||
|
||||
def process_event(event):
    """Validate the URL of one event when its score is above 0.4.

    Returns:
        A ``(url, is_valid, error_msg)`` tuple. ``url`` is ``None`` when the
        event was not checked (score of 0.4 or below, or no "Url" value).
        When a check runs, its result is also stored on the event under the
        "url_valid" key.
    """
    if event.get("score", 0) <= 0.4:
        return None, False, "Score too low"

    target = event.get("Url")
    if not target:
        return None, False, "No URL"

    valid, reason = check_url_selenium(target)
    event["url_valid"] = valid
    return target, valid, reason
|
||||
|
||||
def process_jsonl_file(file_path, max_workers=4):
    """Validate the URLs of high-scoring events in a JSONL file and print a summary.

    Each non-blank line is parsed as JSON. Lines whose "status" is not
    "success" are ignored. From the remaining lines, every entry of "events"
    with a "score" above 0.4 is passed to ``process_event`` on a thread pool.

    Args:
        file_path: Path of the JSONL file to read (UTF-8).
        max_workers: Number of worker threads running checks in parallel.

    Returns:
        None. The list of invalid URLs and the totals are printed.
    """
    invalid_urls = []
    valid_urls = 0
    skipped = 0  # selected events that carried no URL to check

    # Gather events with score > 0.4
    urls_to_check = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not JSON records.
                continue
            line_data = json.loads(line)
            if line_data.get("status") != "success":
                continue
            for event in line_data.get("events", []):
                if event.get("score", 0) > 0.4:
                    urls_to_check.append(event)

    total_urls = len(urls_to_check)

    # ThreadPoolExecutor with tqdm progress bar
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_event = {executor.submit(process_event, e): e for e in urls_to_check}
        for future in tqdm(as_completed(future_to_event), total=total_urls, desc="Checking URLs"):
            try:
                url, is_valid, error_msg = future.result()
            except Exception as exc:
                # One unexpected failure must not abort the whole run; record
                # the event's URL as invalid together with the error.
                url = future_to_event[future].get("Url")
                is_valid, error_msg = False, f"Unexpected error: {exc}"

            if is_valid:
                valid_urls += 1
            elif url:
                invalid_urls.append((url, error_msg))
            else:
                # No URL was checked, so the event is neither valid nor
                # invalid. Counting it as valid inflated the success figure.
                skipped += 1

    # Summary
    if invalid_urls:
        print("\nList of invalid URLs and reasons:")
        for url, err in invalid_urls:
            print(f"{url} --> {err}")
    print("\n=== URL Validation Summary ===")
    print(f"Total URLs processed: {total_urls}")
    print(f"Valid URLs (loaded successfully): {valid_urls}")
    print(f"Invalid URLs: {len(invalid_urls)}")
    if skipped:
        print(f"Events skipped (no URL): {skipped}")
|
||||
|
||||
|
||||
# Command-line entry point: validate the URLs in the given JSONL file.
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Validate URLs in JSONL file events using Selenium"
    )
    cli.add_argument("file_path", type=str, help="Path to the JSONL file")
    cli.add_argument(
        "--workers",
        type=int,
        default=4,
        help="Number of parallel Selenium workers",
    )
    cli_args = cli.parse_args()

    process_jsonl_file(cli_args.file_path, max_workers=cli_args.workers)
|
||||
@@ -27,7 +27,7 @@ DEFAULT_PARAMS = [
|
||||
("organization", "http://weverify.eu/resource/Organization/3727f7b2aa90ec0716693e5464b28d18"), # StopFake
|
||||
]
|
||||
|
||||
NUM_RANDOM_CLAIMS = 200
|
||||
NUM_RANDOM_CLAIMS = 2000
|
||||
|
||||
INPUT_FILE = "../../data/input.jsonl"
|
||||
OUTPUT_FILE = "../../data/claims.json"
|
||||
|
||||
Reference in New Issue
Block a user