diff --git a/agent/.gitignore b/agent/.gitignore index 903d1e5..220584b 100644 --- a/agent/.gitignore +++ b/agent/.gitignore @@ -1,6 +1,5 @@ # -------- Ours -------- -tools/clan/*.csv -tools/clan/*.json + # --------- Github ----------- # Logs diff --git a/agent/agent.ts b/agent/agent.ts index 897b718..0539f88 100644 --- a/agent/agent.ts +++ b/agent/agent.ts @@ -10,6 +10,7 @@ import { produceRanking } from "./nodes/produceRanking"; import { createModelNode } from "./nodes/model"; import { loopEndConditional } from "./conditionals/loop_end"; import { sort } from "./nodes/sort"; +import { triggerEventSetup } from "./nodes/triggerEventSetup"; const triggerEventToolNode = createToolNode(triggerEventToolsByName); @@ -26,6 +27,7 @@ const agent = new StateGraph(MessagesState) .addNode(normalizationSetup.name, normalizationSetup) .addNode("normalisationModel", normalisationModel) + .addNode(triggerEventSetup.name, triggerEventSetup) .addNode("triggerEventToolNode", triggerEventToolNode) .addNode("triggerEventModel", triggerEventModel) @@ -39,7 +41,9 @@ const agent = new StateGraph(MessagesState) .addEdge(START, normalizationSetup.name) .addEdge(normalizationSetup.name, "normalisationModel") - .addEdge("normalisationModel", "triggerEventModel") + .addEdge("normalisationModel", triggerEventSetup.name) + + .addEdge(triggerEventSetup.name, "triggerEventModel") // @ts-expect-error .addConditionalEdges("triggerEventModel", triggerEventToolConditional, ["triggerEventToolNode", verificationSetup.name]) diff --git a/agent/nodes/normalizationSetup.ts b/agent/nodes/normalizationSetup.ts index bc2d661..fee9422 100644 --- a/agent/nodes/normalizationSetup.ts +++ b/agent/nodes/normalizationSetup.ts @@ -1,10 +1,10 @@ import { GraphNode } from "@langchain/langgraph"; import { MessagesState } from "../state"; import { AIMessage, BaseMessage, HumanMessage } from "@langchain/core/messages"; -import { rankFromCSV } from "../tools/clan/retreiveExamples"; +import { rankNormalizedClaims } from "../tools/retreiveExamples"; export const normalizationSetup: GraphNode = async (state) => { - let similarityResults = await rankFromCSV(state.disinformationTitle) + let similarityResults = await rankNormalizedClaims(state.disinformationTitle) let messages : BaseMessage[] = similarityResults.map((item) => { return new AIMessage(`Original Claim: ${item.rawtext}. \n\n Normalised Claim: ${item.cleantext}`) diff --git a/agent/nodes/triggerEventSetup.ts b/agent/nodes/triggerEventSetup.ts new file mode 100644 index 0000000..f2b5b75 --- /dev/null +++ b/agent/nodes/triggerEventSetup.ts @@ -0,0 +1,17 @@ +import { GraphNode } from "@langchain/langgraph"; +import { MessagesState } from "../state"; +import { AIMessage, BaseMessage, HumanMessage } from "@langchain/core/messages"; +import { rankExampleTriggerEvents, rankNormalizedClaims } from "../tools/retreiveExamples"; + +export const triggerEventSetup: GraphNode = async (state) => { + let nc = state?.messages?.at(-1)?.content ?? "" //keep a copy of normalized trigger event. Again two things, womp womp + + //Now give in-context examples. hopwfully we can self-teach? + let similarityResults = await rankExampleTriggerEvents(state.disinformationTitle) + + let messages : BaseMessage[] = similarityResults.map((item) => { + return new AIMessage(`Event: ${item.rawtext}, Claims and given scores: ${item.cleantext}`) + }) + + return { messages: messages, disinformationTitle: state.disinformationTitle, normalizedClaim: nc }; +}; \ No newline at end of file diff --git a/agent/prompts/hydratePrompt.ts b/agent/prompts/hydratePrompt.ts index 555a42a..9b71810 100644 --- a/agent/prompts/hydratePrompt.ts +++ b/agent/prompts/hydratePrompt.ts @@ -9,6 +9,7 @@ export async function hydratePrompt(path: string, state: any) : Promise raw = raw.replace("###TITLE###", state.disinformationTitle); raw = raw.replace("###LM###", state.messages.at(-1).content); + raw = raw.replace("###NTITLE###", state.normalizedClaim); if (raw.indexOf("###TECLAIM###") != -1) { const title = state.proposedTriggerEvent[state.proposedTriggerEventIndex].Event diff --git a/agent/prompts/trigger.txt b/agent/prompts/trigger.txt index 3045a76..7076aef 100644 --- a/agent/prompts/trigger.txt +++ b/agent/prompts/trigger.txt @@ -2,7 +2,7 @@ You are an agent in a pipeline to analyse disinformation. Once the information has been created as below, a dataset can be created to feed a model for prediction, which will improve pre-bunking efforts. There is a false disinformation claim circulating: -###LM### +("###NTITLE### Produce up-to 5 specific "trigger events" that happened that could have led to the spread of this disinformation. Remember the time frame of the disinformation campaign: {{CAMPAIGN_DATE}} @@ -24,4 +24,6 @@ Use your abilities to look between the lines and produce some insightful analysi Events will be reordered as part of processing, each statement must stand alone +The preceeding messages act as examples of previous responses to potentially ficitonal events and scores given, to help understanding the intended quality of analysis + Lets go through it step by step \ No newline at end of file diff --git a/agent/state.ts b/agent/state.ts index d98bcc9..9a69b7f 100644 --- a/agent/state.ts +++ b/agent/state.ts @@ -21,5 +21,6 @@ export const MessagesState = new StateSchema({ messages: MessagesValue, proposedTriggerEvent: ProposedTriggerEventArray, proposedTriggerEventIndex: z.int(), + normalizedClaim: z.string(), }); diff --git a/agent/tools/clan/retreiveExamples.ts b/agent/tools/retreiveExamples.ts similarity index 72% rename from agent/tools/clan/retreiveExamples.ts rename to agent/tools/retreiveExamples.ts index 514f6b2..c595c68 100644 --- a/agent/tools/clan/retreiveExamples.ts +++ b/agent/tools/retreiveExamples.ts @@ -3,14 +3,17 @@ import fs from "fs"; import { pipeline, cos_sim } from "@huggingface/transformers"; import bm25Factory from "wink-bm25-text-search"; import nlp from "wink-nlp-utils"; -import { logger } from "../../utils/logger"; +import { logger } from "../utils/logger"; +import readline from "readline"; const CSV_PATHS = [ - "./tools/clan/dev-eng.csv", - "./tools/clan/train-eng.csv", + "../data/dev-eng.csv", + "../data/train-eng.csv", ]; -const CACHE_PATH = "./tools/clan/csv.cache.json"; +const CACHE_PATH = "../data/csv.cache.json"; + +const JSONL_PATH = "../data/results.jsonl" type EmbeddingCache = { rawtexts: string[]; @@ -36,6 +39,13 @@ let csvEmbeddings: number[][] = []; let csvBM25: any = null; let csvLoaded = false; +let jsonlRawtexts: string[] = []; +let jsonlCleantexts: string[] = []; +let jsonlEmbeddings: number[][] = []; +let jsonlBM25: any = null; +let jsonlLoaded = false; + + logger.info("Loading embedding model..."); const featureExtractor = await pipeline( "feature-extraction", @@ -43,12 +53,12 @@ const featureExtractor = await pipeline( ); logger.info("Embedding model loaded"); -//Cached entrypoint -export async function rankFromCSV( +//Cached entrypoints +export async function rankNormalizedClaims( query: string, topK = 5 ): Promise { - await ensureCSVLoaded(); + await ensureNormalizedClaimCSVLoaded(); logger.info("Ranking from CSV cache..."); @@ -78,6 +88,40 @@ export async function rankFromCSV( return ranked.slice(0, topK); } +export async function rankExampleTriggerEvents( + query: string, + topK = 5 +): Promise { + await ensureExampleClaimJsonlLoaded(); + + logger.info("Ranking from JSONL cache..."); + + const queryEmbedding = await embedText(query); + + const denseScores = jsonlEmbeddings.map((docEmbedding) => + cos_sim(docEmbedding, queryEmbedding) + ); + + const sparseScores = computeSparseScores(query, jsonlBM25, jsonlRawtexts); + + const fusedScores = reciprocalRankFusion([denseScores, sparseScores]); + + const ranked = jsonlRawtexts + .map((text, i) => ({ + id: i, + rawtext: text, + cleantext: jsonlCleantexts[i], + denseScore: denseScores[i], + sparseScore: sparseScores[i], + fusedScore: fusedScores[i], + })) + .sort((a, b) => b.fusedScore - a.fusedScore); + + logger.info("Ranking complete (JSONL mode)"); + + return ranked.slice(0, topK); +} + //Dynamic Entrypoint export async function rankDynamically( query: string, @@ -121,7 +165,7 @@ export async function rankDynamically( } //CSV stuff -async function ensureCSVLoaded(): Promise { +async function ensureNormalizedClaimCSVLoaded(): Promise { if (csvLoaded) return; logger.info("Initializing CSV ranking mode..."); @@ -143,7 +187,7 @@ async function ensureCSVLoaded(): Promise { const seen = new Set(); for (const path of CSV_PATHS) { - await processCSV(path, seen); + await processNormalizationCSV(path, seen); } const cache: EmbeddingCache = { @@ -162,7 +206,7 @@ async function ensureCSVLoaded(): Promise { logger.info("CSV mode ready"); } -async function processCSV( +async function processNormalizationCSV( path: string, seen: Set ): Promise { @@ -190,6 +234,40 @@ async function processCSV( logger.info("Finished CSV: %s", path); } +async function ensureExampleClaimJsonlLoaded(): Promise { + if (jsonlLoaded) return; + + logger.info("Initializing JSONL ranking..."); + //TODO: make jsonl parsins + const stream = fs.createReadStream(JSONL_PATH); + + const rl = readline.createInterface({ + input: stream, + crlfDelay: Infinity, + }); + + for await (const line of rl) { + if (!line.trim()) continue; // skip empty lines + + const row = JSON.parse(line); + + const text = row.text; + + const embedding = await embedText(text); + + jsonlRawtexts.push(text); + + jsonlCleantexts.push(row.output[0].content); + jsonlEmbeddings.push(embedding); + } + + + jsonlBM25 = buildBM25(jsonlRawtexts); + + jsonlLoaded = true; + logger.info("JSONL ranking done"); +} + async function embedText(text: string): Promise { const output = await featureExtractor(text, { @@ -276,4 +354,7 @@ function reciprocalRankFusion( // { id: 5, rawtext: "She lost her balance and fell down the stairs." }, // ] // ) -// ); \ No newline at end of file +// ); + +// await ensureExampleClaimJsonlLoaded() +// console.log(await rankExampleTriggerEvents("Niger")) \ No newline at end of file diff --git a/agent/tools/triggerEventTools.ts b/agent/tools/triggerEventTools.ts index 773e2be..595bebe 100644 --- a/agent/tools/triggerEventTools.ts +++ b/agent/tools/triggerEventTools.ts @@ -2,7 +2,7 @@ import { tool } from "@langchain/core/tools"; import * as z from "zod"; import { queryScraper } from "./webSearch"; import { extractWebpageContent } from "./webpageFetch"; -import { rankDynamically } from "./clan/retreiveExamples"; +import { rankDynamically } from "./retreiveExamples"; export async function rankAndDisplayData(data: string[], context: string):Promise { diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..6390df2 --- /dev/null +++ b/run.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +set -e + +run_agent () { + echo "Starting LangGraph agent..." + cd agent + npx @langchain/langgraph-cli dev +} + +run_ragas_service () { + echo "Starting RAGAS service..." + cd "supporting/RAGAS_Service" + .venv/bin/uvicorn ragas_service:app --port 8001 +} + +run_frontend () { + echo "Starting frontend (Streamlit)..." + cd "supporting/scorer" + .venv/bin/streamlit run display.py +} + +run_fetch () { + echo "Running fetch job..." + cd "supporting/dbkf" + python fetch.py +} + +run_wrapper () { + echo "Running wrapper..." + cd "supporting/Wrapper" + npm run dev +} + +run_analysis () { + cd supporting/scorer + python analyse.py +} + +case "$1" in + agent) run_agent ;; + ragas_service) run_ragas_service ;; + frontend) run_frontend ;; + fetch) run_fetch ;; + wrapper) run_wrapper ;; + analysis) run_analysis ;; + *) + echo "Unknown command: $1" + echo "Usage: ./runproject [agent|ragas_service|frontend|fetch|wrapper|analysis]" + exit 1 + ;; +esac diff --git a/supporting/Wrapper/.gitignore b/supporting/Wrapper/.gitignore index cfdeb50..e9e726a 100644 --- a/supporting/Wrapper/.gitignore +++ b/supporting/Wrapper/.gitignore @@ -1,7 +1,3 @@ -# -------- Ours -------- -claims.json -results*.jsonl - # --------- Github ----------- # Logs logs diff --git a/supporting/Wrapper/run.ts b/supporting/Wrapper/run.ts index 13a8bcb..97b93d7 100644 --- a/supporting/Wrapper/run.ts +++ b/supporting/Wrapper/run.ts @@ -5,8 +5,8 @@ import cliProgress from "cli-progress"; import pLimit from "p-limit"; -const INPUT_FILE = "./claims.json"; -const OUTPUT_FILE = "./results.jsonl"; +const INPUT_FILE = "../../data/claims.json"; +const OUTPUT_FILE = "../../data/results.jsonl"; const API_URL = "http://localhost:2024"; const AGENT_NAME = "agent"; const MAX_CONCURRENCY = 50; diff --git a/supporting/dbkf/fetch.py b/supporting/dbkf/fetch.py index 34ef503..3ba8220 100644 --- a/supporting/dbkf/fetch.py +++ b/supporting/dbkf/fetch.py @@ -13,16 +13,16 @@ DEFAULT_PARAMS = { "concept": "http://weverify.eu/resource/Concept/Q212", "documentTypes": "http://schema.org/Claim", "from": "2000-01-01", - "to": "2023-10-17", + "to": "2026-10-17", "lang": "en", - "limit": 50, # Max per page + "limit": 300, # Max per page "page": 1, "orderBy": "date" } NUM_RANDOM_CLAIMS = 10 -OUTPUT_FILE = "../Wrapper/claims.json" +OUTPUT_FILE = "../../data/claims.json" def fetch_claims(params=None): if params is None: diff --git a/supporting/scorer/analyse.py b/supporting/scorer/analyse.py index 3db6019..c5cb6d5 100644 --- a/supporting/scorer/analyse.py +++ b/supporting/scorer/analyse.py @@ -4,7 +4,7 @@ from statistics import mean # ------------------------------------------------------------ # Load JSONL file # ------------------------------------------------------------ -DATA_FILE = "../Wrapper/results.jsonl" +DATA_FILE = "../../data/results.jsonl" data = [] with open(DATA_FILE, "r", encoding="utf-8") as f: diff --git a/supporting/scorer/display.py b/supporting/scorer/display.py index b164fe2..2a1150c 100644 --- a/supporting/scorer/display.py +++ b/supporting/scorer/display.py @@ -4,7 +4,7 @@ import random from pathlib import Path # Path to your JSONL file -DATA_FILE = "../Wrapper/results.jsonl" +DATA_FILE = "../../data/results.jsonl" # -------------------------- # Helper functions