Update README, lock langchain CLI to specific version

Remove some very unused prompts
Add database link to README
2026-05-07 18:45:12 +01:00 · 2026-05-03 21:46:54 +01:00 · 2026-04-09 15:46:18 +01:00 · 2026-04-05 22:47:25 +01:00 · 2026-04-05 12:31:09 +01:00 · 2026-04-05 11:51:28 +01:00
14 changed files with 317 additions and 64 deletions
@@ -1,9 +1,22 @@
 # AI models for identifying trigger events in disinformation analysis
 Final Dissertation Submission Repository

-## Project Description
+## Abstract
 -- todo --

+[Project Presentation](https://jillweynes.github.io/LLMsForDisinformationPrediction-GraphVizBuilt/presentation)
+
+## Generated Dataset Link and Usage Experiments
+Generated Dataset Link: [https://huggingface.co/datasets/WillJeynes/LLMsForDisinformationAnalysis-Dataset](https://huggingface.co/datasets/WillJeynes/LLMsForDisinformationAnalysis-Dataset)
+
+Graph-Based Dataset Visualisation: [https://jillweynes.github.io/LLMsForDisinformationPrediction-GraphVizBuilt/](https://jillweynes.github.io/LLMsForDisinformationPrediction-GraphVizBuilt/)
+
+Usage Experiments (incl. graph visualisation) Source Code: [https://github.com/WillJeynes/LLMsForDisinformationPrediction](https://github.com/WillJeynes/LLMsForDisinformationPrediction)
+
+
+
+# This repository:
+
 ## Solution Diagram
 -- todo --

@@ -13,8 +26,6 @@ Final Dissertation Submission Repository
 ## Agent Refinement
 [See agent](/agent/)

-## Generated Database Link and Usage Experiments
-- todo --

 ## Repository Structure
 ```
@@ -1,3 +1,32 @@
 ## Refining the agent output

-TODO: Table and document experiments
+Experiments modifying the pipeline:
+
+| Model            | % Correct | % Change |
+|------------------|----------:|---------:|
+| BASELINE         | 33        | 0        |
+| Improved Prompt  | 39.96     | 0.21     |
+| Add Examples     | 44.67     | 0.35     |
+| Date             | 45.51     | 0.38     |
+| Chain of Thought | 43.38     | 0.31     |
+| Self-Critique    | 44.36     | 0.34     |
+
+Experiments with different model types:
+| Model                         | % Correct | % Change |
+|-------------------------------|----------:|---------:|
+| gpt-5-mini                    | 45.51     |          |
+| gpt-5.4-mini                  | 32.4      |          |
+| gpt-5.4-nano                  | 23.28     |          |
+| gpt-4.1-mini                  | 27.85     |          |
+| gpt-4o-mini                   | 32.47     |          |
+| llama3.1:8b-instruct-q4_K_M   | ?         |          |
+| qwen3.5:9b                    | 0         |          |
+
+Percentage of valid URLs:
+| Model                         | Number    | % Valid  |
+|-------------------------------|----------:|---------:|
+| gpt-5-mini                    | 22/405    | 5.43     |
+| gpt-5.4-mini                  | 29/278    | 10.43    |
+| gpt-5.4-nano                  | 6/210     | 2.85     |
+| gpt-4.1-mini                  | 15/269    | 5.57     |
+| gpt-4o-mini                   | 27/287    | 9.407    |
@@ -1,8 +1,7 @@
 import { GraphNode } from "@langchain/langgraph";
 import { MessagesState, ProposedTriggerEventArray } from "../state";
 import { logger } from "../utils/logger";
-import { queryScraper } from "../tools/webSearch";
-import { rankAndDisplayData } from "../tools/triggerEventTools";
+import { jsonrepair } from 'jsonrepair'

 export const verificationSetup: GraphNode<typeof MessagesState> = async (state) => {
  //this is kinda doing two things, but having two nodes for it seems overkill
@@ -11,15 +10,29 @@ export const verificationSetup: GraphNode<typeof MessagesState> = async (state)
    logger.warn("No trigger events in memory, parsing")

    let genResponse = state.messages.at(-1)?.content.toString() ?? "";
-    const parsed = ProposedTriggerEventArray.parse(JSON.parse(genResponse));

-    for (let i = 0; i < parsed.length; i++) {
-      const search = parsed[i].SearchQuery
-      // const data = await queryScraper(search);
-      // const output = await rankAndDisplayData(data, search);
+    const repaired = jsonrepair(genResponse);

-      // parsed[i].context = output;
-      parsed[i].context = "NONE"
+    let parsed;
+
+    try {
+      const json = JSON.parse(repaired);
+
+      if (Array.isArray(json)) {
+        parsed = ProposedTriggerEventArray.parse(json);
+      } else {
+        // try grab first value
+        const firstValue = Object.values(json)[0];
+
+        if (Array.isArray(firstValue)) {
+          parsed = ProposedTriggerEventArray.parse(firstValue);
+        } else {
+          throw new Error("No array found in JSON");
+        }
+      }
+    } catch (err: any) {
+      logger.error(`Failed to parse LLM response: ${err.message}`);
+      throw new Error(`Failed to parse LLM response: ${err}`);
    }
    
    return { proposedTriggerEvent: parsed, proposedTriggerEventIndex: 0 };
@@ -20,6 +20,7 @@
        "dotenv": "^17.2.3",
        "exponential-backoff": "^3.1.3",
        "fs": "^0.0.1-security",
+        "jsonrepair": "^3.13.3",
        "langchain": "^1.2.14",
        "selenium-webdriver": "^4.40.0",
        "tldts": "^7.0.23",
@@ -2075,6 +2076,15 @@
      "integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==",
      "license": "ISC"
    },
+    "node_modules/jsonrepair": {
+      "version": "3.13.3",
+      "resolved": "https://registry.npmjs.org/jsonrepair/-/jsonrepair-3.13.3.tgz",
+      "integrity": "sha512-BTznj0owIt2CBAH/LTo7+1I5pMvl1e1033LRl/HUowlZmJOIhzC0zbX5bxMngLkfT4WnzPP26QnW5wMr2g9tsQ==",
+      "license": "ISC",
+      "bin": {
+        "jsonrepair": "bin/cli.js"
+      }
+    },
    "node_modules/jszip": {
      "version": "3.10.1",
      "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz",
@@ -24,6 +24,7 @@
    "dotenv": "^17.2.3",
    "exponential-backoff": "^3.1.3",
    "fs": "^0.0.1-security",
+    "jsonrepair": "^3.13.3",
    "langchain": "^1.2.14",
    "selenium-webdriver": "^4.40.0",
    "tldts": "^7.0.23",
@@ -1,9 +0,0 @@
-Could the following real-world event:
-###TECLAIM###
-
-Be a trigger for the following disinformation:
-###TITLE###
-
-Respond with "RELATION", followed by : followed by a confidence score (VERYHIGH, HIGH, MEDIUM, LOW, VERYLOW) followed by : followed by the reason. Use no other words, just return the score and reason in format.
-
-Ignore wether the event happened or not, purely consider the likiness of causation
@@ -1,8 +0,0 @@
-Do the search results cited below
-###TESEARCH###
-Support the idea that the following happened:
-###TECLAIM###
-
-Respond with "CONFIDENCE", followed by : followed by a confidence score (VERYHIGH, HIGH, MEDIUM, LOW, VERYLOW) followed by : followed by the reason. Use no other words, just return the score and reason in format.
-
-Dates can be off by a few days, that would still be valid
@@ -15,6 +15,8 @@ const CACHE_PATH = "../data/csv.cache.json";

 const JSONL_PATH = "../data/input.jsonl"

+const BM25_MIN_DOCS = 3;
+
 type EmbeddingCache = {
  rawtexts: string[];
  cleantexts: string[];
@@ -287,8 +289,20 @@ async function embedText(text: string): Promise<number[]> {
 }

 function buildBM25(texts: string[]) {
-  logger.info("Building BM25 index (%s docs)...", texts.length);
+  let paddedTexts = texts;

+  if (texts.length < BM25_MIN_DOCS) {
+    const needed = BM25_MIN_DOCS - texts.length;
+    logger.error(
+      "Corpus too small for BM25 (%s docs, need %s+), padding with %s dummy doc(s)",
+      texts.length,
+      BM25_MIN_DOCS,
+      needed
+    );
+    paddedTexts = [...texts, ...Array(needed).fill("placeholder dummy document")];
+  }
+
+  logger.info("Building BM25 index (%s docs)...", paddedTexts.length);
  const bm25 = bm25Factory();

  bm25.defineConfig({
@@ -302,7 +316,7 @@ function buildBM25(texts: string[]) {
    nlp.tokens.removeWords,
  ]);

-  texts.forEach((text, i) => {
+  paddedTexts.forEach((text, i) => {
    bm25.addDoc({ text }, i);
  });

@@ -1,32 +1,92 @@
 import { Builder, Browser } from "selenium-webdriver";
 import firefox from "selenium-webdriver/firefox";
+import { backOff } from "exponential-backoff";
+import { logger } from "../utils/logger";

-export async function extractWebpageContent(url: string) : Promise<string[]>{
-    const options = new firefox.Options();
-    options.addArguments("--headless");
-
-    let driver = await new Builder().forBrowser(Browser.FIREFOX).setFirefoxOptions(options).build()
-    try {
-        await driver.get(url)
-        await driver.wait(async () => {
-            return await driver.executeScript(
-                "return document.readyState === 'complete'"
-            );
-        }, 5000);
-
-        const readableText = await driver.executeScript(
-            "return document.body.innerText;"
-        ) as string;
-
-        const filteredLines = readableText
-            .split(/\r?\n/)
-            .map(line => line.trim())
-            .filter(line => line.split(/\s+/).length > 1); 
-        
-        return filteredLines;
-    } finally {
-        await driver.quit()
-    }
+export async function extractWebpageContent(url: string): Promise<string[]> {
+  try {
+    const response = await backOff(async () => {
+      return await extractWebpageContentWorker(url);
+    }, {
+      numOfAttempts: 10,
+      startingDelay: 500,
+      timeMultiple: 2,
+      jitter: "full",
+      maxDelay: 50000,
+    });
+    return response;
+  } catch (err: any) {
+    logger.error(`Failed out of retry loop for URL "${url}", returning placeholder to pipeline`);
+    return ["API EXCEPTION"];
+  }
 }

-//console.log(await extractWebpageContent("https://www.bbc.co.uk/news/live/c74wd01egvyt"))
+async function extractWebpageContentWorker(url: string): Promise<string[]> {
+  let driver;
+  try {
+    const options = new firefox.Options();
+    options.addArguments("--headless");
+    driver = await new Builder()
+      .forBrowser(Browser.FIREFOX)
+      .setFirefoxOptions(options)
+      .build();
+  } catch (err: any) {
+    const desc = `Failed to launch Firefox driver: ${err.message}`;
+    logger.error(desc);
+    throw new Error(desc);
+  }
+
+  try {
+    try {
+      await driver.get(url);
+    } catch (err: any) {
+      const desc = `Failed to navigate to URL "${url}": ${err.message}`;
+      logger.error(desc);
+      throw new Error(desc);
+    }
+
+    try {
+      await driver.wait(async () => {
+        return await driver.executeScript(
+          "return document.readyState === 'complete'"
+        );
+      }, 5000);
+    } catch (err: any) {
+      logger.error(`Page load timed out for "${url}", attempting to read partial content: ${err.message}`);
+      // do not throw, attempt to read
+    }
+
+    let readableText: string;
+    try {
+      readableText = await driver.executeScript(
+        "return document.body.innerText;"
+      ) as string;
+    } catch (err: any) {
+      const desc = `Failed to extract page text from "${url}": ${err.message}`;
+      logger.error(desc);
+      throw new Error(desc);
+    }
+
+    const filteredLines = readableText
+      .split(/\r?\n/)
+      .map(line => line.trim())
+      .filter(line => line.split(/\s+/).length > 1);
+
+    if (filteredLines.length === 0) {
+      const desc = `No content extracted from "${url}"`;
+      logger.error(desc);
+      throw new Error(desc);
+    }
+
+    return filteredLines;
+  } finally {
+    try {
+      await driver.quit();
+    } catch (err: any) {
+      logger.error(`Failed to quit Firefox driver cleanly: ${err.message}`);
+    }
+  }
+}
+
+// console.log(await extractWebpageContent("https://www.bbc.co.uk/news/live/c74wd01egvyt"))
+// console.log(await extractWebpageContent("https://badcertificate.int.jeynes.uk/"))
@@ -5,7 +5,7 @@ set -e
 run_agent () {
    echo "Starting LangGraph agent..."
    cd agent
-    npx @langchain/langgraph-cli dev
+    npx @langchain/langgraph-cli@1.1.17 dev
 }

 run_ensemble_service () {
@@ -9,6 +9,7 @@ datasets
 # ROBERTA
 scikit-learn
 transformers[torch]
+sentence_transformers

 # Utils
 numpy
@@ -19,6 +19,9 @@ const MODE = process.env.MODE ?? "claim";

 const MAX_CONCURRENCY = 5;

+const OFFSET = parseInt(process.env.OFFSET ?? "0", 10);
+const LIMIT = process.env.LIMIT ? parseInt(process.env.LIMIT, 10) : null;
+
 const client = new Client({ apiUrl: API_URL });


@@ -118,7 +121,7 @@ async function processRecord(record: any): Promise<ResultRecord> {
      input: buildAgentInput(record),
      streamMode: "values",
      config: {
-        recursion_limit: 50
+        recursion_limit: 100
      }
    });

@@ -164,10 +167,19 @@ async function processRecord(record: any): Promise<ResultRecord> {
 async function main() {
  console.log("Reading input file...");

-  const records = await loadInputs();
+  const allRecords = await loadInputs();

-  console.log(`Loaded ${records.length} records`);
+  console.log(`Loaded ${allRecords.length} records`);

+  const records = allRecords.slice(
+    OFFSET,
+    LIMIT !== null ? OFFSET + LIMIT : undefined
+  );
+
+  console.log(
+    `Processing ${records.length} records (offset=${OFFSET}, limit=${LIMIT ?? "∞"})`
+  );
+  
  fs.writeFileSync(OUTPUT_FILE, "", { flag: "a" });

  const limit = pLimit(MAX_CONCURRENCY);
@@ -0,0 +1,119 @@
+import json
+import argparse
+from urllib.parse import urlparse
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.common.exceptions import WebDriverException, TimeoutException, StaleElementReferenceException
+from tqdm import tqdm
+
+def init_driver():
+    options = Options()
+    options.headless = True
+    options.add_argument("--disable-gpu")
+    options.add_argument("--no-sandbox")
+    options.add_argument("--headless")
+    options.add_argument("--disable-blink-features=AutomationControlled")
+    options.add_argument("--window-size=1920,1080")
+    prefs = {
+        "profile.managed_default_content_settings.images": 2,  # block images
+        "profile.default_content_setting_values.stylesheets": 2,  # block CSS
+        "profile.managed_default_content_settings.cookies": 2,  # optional
+    }
+    options.add_experimental_option("prefs", prefs)
+
+    driver = webdriver.Chrome(options=options)
+    driver.set_page_load_timeout(30)
+    return driver
+
+def is_root_url(url):
+    parsed = urlparse(url)
+    return parsed.path in ("", "/")
+
+def is_404_page(driver):
+    """Safely check for 404, handling stale elements."""
+    try:
+        title = driver.title.lower()
+        body_text = driver.find_element("tag name", "body").text.lower()
+        return "404" in title or "404" in body_text
+    except StaleElementReferenceException:
+        return False
+    except Exception:
+        return False
+
+def check_url_selenium(url):
+    driver = None
+    try:
+        driver = init_driver()
+        driver.get(url)
+        # 404 check
+        if is_404_page(driver):
+            return False, "404 page detected"
+        # Root URL after redirects
+        final_url = driver.current_url
+        if is_root_url(final_url):
+            return False, f"Redirected to root URL ({final_url})"
+        return True, None
+    except (WebDriverException, TimeoutException) as e:
+        return False, str(e)
+    finally:
+        if driver:
+            driver.quit()
+
+def process_event(event):
+    """Process an event only if score > 0.4."""
+    score = event.get("score", 0)
+    if score <= 0.4:
+        return None, False, "Score too low"
+    url = event.get("Url")
+    if not url:
+        return None, False, "No URL"
+    is_valid, error_msg = check_url_selenium(url)
+    event["url_valid"] = is_valid
+    return url, is_valid, error_msg
+
+def process_jsonl_file(file_path, max_workers=4):
+    invalid_urls = []
+    valid_urls = 0
+
+    # Gather events with score > 0.4
+    urls_to_check = []
+    with open(file_path, "r", encoding="utf-8") as f:
+        for line in f:
+            line_data = json.loads(line)
+            if line_data.get("status") != "success":
+                continue
+            for event in line_data.get("events", []):
+                if event.get("score", 0) > 0.4:
+                    urls_to_check.append(event)
+
+    total_urls = len(urls_to_check)
+
+    # ThreadPoolExecutor with tqdm progress bar
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_event = {executor.submit(process_event, e): e for e in urls_to_check}
+        for future in tqdm(as_completed(future_to_event), total=total_urls, desc="Checking URLs"):
+            url, is_valid, error_msg = future.result()
+            if not is_valid and url:
+                invalid_urls.append((url, error_msg))
+            else:
+                valid_urls += 1
+
+    # Summary
+    if invalid_urls:
+        print("\nList of invalid URLs and reasons:")
+        for url, err in invalid_urls:
+            print(f"{url} --> {err}")
+    print("\n=== URL Validation Summary ===")
+    print(f"Total URLs processed: {total_urls}")
+    print(f"Valid URLs (loaded successfully): {valid_urls}")
+    print(f"Invalid URLs: {len(invalid_urls)}")
+    
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Validate URLs in JSONL file events using Selenium")
+    parser.add_argument("file_path", type=str, help="Path to the JSONL file")
+    parser.add_argument("--workers", type=int, default=4, help="Number of parallel Selenium workers")
+    args = parser.parse_args()
+
+    process_jsonl_file(args.file_path, max_workers=args.workers)
@@ -27,7 +27,7 @@ DEFAULT_PARAMS = [
    ("organization", "http://weverify.eu/resource/Organization/3727f7b2aa90ec0716693e5464b28d18"), # StopFake
 ]

-NUM_RANDOM_CLAIMS = 200
+NUM_RANDOM_CLAIMS = 2000

 INPUT_FILE = "../../data/input.jsonl"
 OUTPUT_FILE = "../../data/claims.json"
Author	SHA1	Message	Date
William Jeynes	4e0bab9897	Update README, lock langchain CLI to specific version	2026-05-07 18:45:12 +01:00
William Jeynes	c4dac3f515	Remove some very unused prompts	2026-05-03 21:46:54 +01:00
William Jeynes	2252a42466	Add database link to README	2026-04-09 15:46:18 +01:00
William Jeynes	75ca1032a6	Add offset and limit in pereparation for the large dataset	2026-04-05 22:47:25 +01:00
William Jeynes	00d129bd28	add % valid URLs for different model	2026-04-05 12:31:09 +01:00
William Jeynes	cf923d6e87	Add new accuracy results	2026-04-05 11:51:28 +01:00
William Jeynes	f821e9643d	Add url validity metrics	2026-04-04 20:02:25 +01:00
William Jeynes	43ecd04135	add multithreading	2026-04-04 19:42:02 +01:00
William Jeynes	8c0921057b	start on work to calculate % if valid URLS	2026-04-04 18:52:47 +01:00
William Jeynes	b610e8c989	Add sentence transformers to requirements for ensemble service	2026-03-31 15:52:14 +01:00
William Jeynes	f8d4155b7c	Add more robust parsing of LLM JSON output	2026-03-27 11:09:59 +00:00
William Jeynes	5e374a8bd6	Fix errors seen during longer runs: selenium exceptions, insecure certificates, recusrsion limit exceeded, BM25 document corpus too small	2026-03-26 12:22:13 +00:00