Move all data to its own folder. Add a run shell script. Experiment (unsuccessfully so far) with example retrieval

This commit is contained in:
William Jeynes
2026-02-16 22:42:13 +00:00
parent 90894b2c10
commit 3f14b61cd4
16 changed files with 184 additions and 29 deletions
+1 -2
View File
@@ -1,6 +1,5 @@
# -------- Ours -------- # -------- Ours --------
tools/clan/*.csv
tools/clan/*.json
# --------- Github ----------- # --------- Github -----------
# Logs # Logs
+5 -1
View File
@@ -10,6 +10,7 @@ import { produceRanking } from "./nodes/produceRanking";
import { createModelNode } from "./nodes/model"; import { createModelNode } from "./nodes/model";
import { loopEndConditional } from "./conditionals/loop_end"; import { loopEndConditional } from "./conditionals/loop_end";
import { sort } from "./nodes/sort"; import { sort } from "./nodes/sort";
import { triggerEventSetup } from "./nodes/triggerEventSetup";
const triggerEventToolNode = createToolNode(triggerEventToolsByName); const triggerEventToolNode = createToolNode(triggerEventToolsByName);
@@ -26,6 +27,7 @@ const agent = new StateGraph(MessagesState)
.addNode(normalizationSetup.name, normalizationSetup) .addNode(normalizationSetup.name, normalizationSetup)
.addNode("normalisationModel", normalisationModel) .addNode("normalisationModel", normalisationModel)
.addNode(triggerEventSetup.name, triggerEventSetup)
.addNode("triggerEventToolNode", triggerEventToolNode) .addNode("triggerEventToolNode", triggerEventToolNode)
.addNode("triggerEventModel", triggerEventModel) .addNode("triggerEventModel", triggerEventModel)
@@ -39,7 +41,9 @@ const agent = new StateGraph(MessagesState)
.addEdge(START, normalizationSetup.name) .addEdge(START, normalizationSetup.name)
.addEdge(normalizationSetup.name, "normalisationModel") .addEdge(normalizationSetup.name, "normalisationModel")
.addEdge("normalisationModel", "triggerEventModel") .addEdge("normalisationModel", triggerEventSetup.name)
.addEdge(triggerEventSetup.name, "triggerEventModel")
// @ts-expect-error // @ts-expect-error
.addConditionalEdges("triggerEventModel", triggerEventToolConditional, ["triggerEventToolNode", verificationSetup.name]) .addConditionalEdges("triggerEventModel", triggerEventToolConditional, ["triggerEventToolNode", verificationSetup.name])
+2 -2
View File
@@ -1,10 +1,10 @@
import { GraphNode } from "@langchain/langgraph"; import { GraphNode } from "@langchain/langgraph";
import { MessagesState } from "../state"; import { MessagesState } from "../state";
import { AIMessage, BaseMessage, HumanMessage } from "@langchain/core/messages"; import { AIMessage, BaseMessage, HumanMessage } from "@langchain/core/messages";
import { rankFromCSV } from "../tools/clan/retreiveExamples"; import { rankNormalizedClaims } from "../tools/retreiveExamples";
export const normalizationSetup: GraphNode<typeof MessagesState> = async (state) => { export const normalizationSetup: GraphNode<typeof MessagesState> = async (state) => {
let similarityResults = await rankFromCSV(state.disinformationTitle) let similarityResults = await rankNormalizedClaims(state.disinformationTitle)
let messages : BaseMessage[] = similarityResults.map((item) => { let messages : BaseMessage[] = similarityResults.map((item) => {
return new AIMessage(`Original Claim: ${item.rawtext}. \n\n Normalised Claim: ${item.cleantext}`) return new AIMessage(`Original Claim: ${item.rawtext}. \n\n Normalised Claim: ${item.cleantext}`)
+17
View File
@@ -0,0 +1,17 @@
import { GraphNode } from "@langchain/langgraph";
import { MessagesState } from "../state";
import { AIMessage, BaseMessage, HumanMessage } from "@langchain/core/messages";
import { rankExampleTriggerEvents, rankNormalizedClaims } from "../tools/retreiveExamples";
/**
 * Graph node that prepares state for the trigger-event stage.
 *
 * It snapshots the content of the most recent message into `normalizedClaim`
 * and returns the top-ranked example trigger events for
 * `state.disinformationTitle` as AI messages, to serve as in-context examples.
 *
 * @param state Current graph state; reads `messages` and `disinformationTitle`.
 * @returns The example messages plus `disinformationTitle` and `normalizedClaim`.
 */
export const triggerEventSetup: GraphNode<typeof MessagesState> = async (state) => {
  // Snapshot the latest message content before the example messages are returned.
  const lastContent: unknown = state.messages?.at(-1)?.content;

  // `normalizedClaim` is stored as a string, so only pass strings through untouched.
  // NOTE(review): LangChain message content can also be a list of content blocks;
  // those are serialised here rather than stored as a non-string — confirm this is
  // the desired representation for the prompt.
  let normalizedClaim = "";
  if (typeof lastContent === "string") {
    normalizedClaim = lastContent;
  } else if (lastContent != null) {
    normalizedClaim = JSON.stringify(lastContent);
  }

  // In-context examples: previously seen events with the claims and scores they were given.
  const similarityResults = await rankExampleTriggerEvents(state.disinformationTitle);
  const messages: BaseMessage[] = similarityResults.map(
    (item) => new AIMessage(`Event: ${item.rawtext}, Claims and given scores: ${item.cleantext}`)
  );

  return { messages, disinformationTitle: state.disinformationTitle, normalizedClaim };
};
+1
View File
@@ -9,6 +9,7 @@ export async function hydratePrompt(path: string, state: any) : Promise<string>
raw = raw.replace("###TITLE###", state.disinformationTitle); raw = raw.replace("###TITLE###", state.disinformationTitle);
raw = raw.replace("###LM###", state.messages.at(-1).content); raw = raw.replace("###LM###", state.messages.at(-1).content);
raw = raw.replace("###NTITLE###", state.normalizedClaim);
if (raw.indexOf("###TECLAIM###") != -1) { if (raw.indexOf("###TECLAIM###") != -1) {
const title = state.proposedTriggerEvent[state.proposedTriggerEventIndex].Event const title = state.proposedTriggerEvent[state.proposedTriggerEventIndex].Event
+3 -1
View File
@@ -2,7 +2,7 @@ You are an agent in a pipeline to analyse disinformation.
Once the information has been created as below, a dataset can be created to feed a model for prediction, which will improve pre-bunking efforts. Once the information has been created as below, a dataset can be created to feed a model for prediction, which will improve pre-bunking efforts.
There is a false disinformation claim circulating: There is a false disinformation claim circulating:
###LM### ("###NTITLE###
Produce up-to 5 specific "trigger events" that happened that could have led to the spread of this disinformation. Produce up-to 5 specific "trigger events" that happened that could have led to the spread of this disinformation.
Remember the time frame of the disinformation campaign: {{CAMPAIGN_DATE}} Remember the time frame of the disinformation campaign: {{CAMPAIGN_DATE}}
@@ -24,4 +24,6 @@ Use your abilities to look between the lines and produce some insightful analysi
Events will be reordered as part of processing, each statement must stand alone Events will be reordered as part of processing, each statement must stand alone
The preceding messages act as examples of previous responses to potentially fictional events and the scores given, to help you understand the intended quality of analysis
Lets go through it step by step Lets go through it step by step
+1
View File
@@ -21,5 +21,6 @@ export const MessagesState = new StateSchema({
messages: MessagesValue, messages: MessagesValue,
proposedTriggerEvent: ProposedTriggerEventArray, proposedTriggerEvent: ProposedTriggerEventArray,
proposedTriggerEventIndex: z.int(), proposedTriggerEventIndex: z.int(),
normalizedClaim: z.string(),
}); });
@@ -3,14 +3,17 @@ import fs from "fs";
import { pipeline, cos_sim } from "@huggingface/transformers"; import { pipeline, cos_sim } from "@huggingface/transformers";
import bm25Factory from "wink-bm25-text-search"; import bm25Factory from "wink-bm25-text-search";
import nlp from "wink-nlp-utils"; import nlp from "wink-nlp-utils";
import { logger } from "../../utils/logger"; import { logger } from "../utils/logger";
import readline from "readline";
const CSV_PATHS = [ const CSV_PATHS = [
"./tools/clan/dev-eng.csv", "../data/dev-eng.csv",
"./tools/clan/train-eng.csv", "../data/train-eng.csv",
]; ];
const CACHE_PATH = "./tools/clan/csv.cache.json"; const CACHE_PATH = "../data/csv.cache.json";
const JSONL_PATH = "../data/results.jsonl"
type EmbeddingCache = { type EmbeddingCache = {
rawtexts: string[]; rawtexts: string[];
@@ -36,6 +39,13 @@ let csvEmbeddings: number[][] = [];
let csvBM25: any = null; let csvBM25: any = null;
let csvLoaded = false; let csvLoaded = false;
let jsonlRawtexts: string[] = [];
let jsonlCleantexts: string[] = [];
let jsonlEmbeddings: number[][] = [];
let jsonlBM25: any = null;
let jsonlLoaded = false;
logger.info("Loading embedding model..."); logger.info("Loading embedding model...");
const featureExtractor = await pipeline( const featureExtractor = await pipeline(
"feature-extraction", "feature-extraction",
@@ -43,12 +53,12 @@ const featureExtractor = await pipeline(
); );
logger.info("Embedding model loaded"); logger.info("Embedding model loaded");
//Cached entrypoint //Cached entrypoints
export async function rankFromCSV( export async function rankNormalizedClaims(
query: string, query: string,
topK = 5 topK = 5
): Promise<RankedResult[]> { ): Promise<RankedResult[]> {
await ensureCSVLoaded(); await ensureNormalizedClaimCSVLoaded();
logger.info("Ranking from CSV cache..."); logger.info("Ranking from CSV cache...");
@@ -78,6 +88,40 @@ export async function rankFromCSV(
return ranked.slice(0, topK); return ranked.slice(0, topK);
} }
/**
 * Ranks the loaded JSONL example trigger events against `query`.
 *
 * Each example gets a dense score (cosine similarity of its embedding with the
 * query embedding) and a sparse score (from `computeSparseScores`); the two
 * score lists are combined by `reciprocalRankFusion` and the examples are
 * returned in descending fused-score order.
 *
 * @param query Text to rank the examples against.
 * @param topK Maximum number of results to return (default 5).
 * @returns The best `topK` examples with their dense, sparse and fused scores.
 */
export async function rankExampleTriggerEvents(
  query: string,
  topK = 5
): Promise<RankedResult[]> {
  // Make sure the JSONL examples are in memory before scoring.
  await ensureExampleClaimJsonlLoaded();

  logger.info("Ranking from JSONL cache...");

  const queryVector = await embedText(query);

  // Dense signal: one cosine similarity per stored embedding.
  const dense: number[] = [];
  for (const docVector of jsonlEmbeddings) {
    dense.push(cos_sim(docVector, queryVector));
  }

  // Sparse signal, then fuse both score lists into a single score per example.
  const sparse = computeSparseScores(query, jsonlBM25, jsonlRawtexts);
  const fused = reciprocalRankFusion([dense, sparse]);

  // Assemble one record per example, indexed by its position in the loaded arrays.
  const candidates = [];
  for (let idx = 0; idx < jsonlRawtexts.length; idx++) {
    candidates.push({
      id: idx,
      rawtext: jsonlRawtexts[idx],
      cleantext: jsonlCleantexts[idx],
      denseScore: dense[idx],
      sparseScore: sparse[idx],
      fusedScore: fused[idx],
    });
  }

  // Highest fused score first.
  candidates.sort((left, right) => right.fusedScore - left.fusedScore);

  logger.info("Ranking complete (JSONL mode)");

  return candidates.slice(0, topK);
}
//Dynamic Entrypoint //Dynamic Entrypoint
export async function rankDynamically( export async function rankDynamically(
query: string, query: string,
@@ -121,7 +165,7 @@ export async function rankDynamically(
} }
//CSV stuff //CSV stuff
async function ensureCSVLoaded(): Promise<void> { async function ensureNormalizedClaimCSVLoaded(): Promise<void> {
if (csvLoaded) return; if (csvLoaded) return;
logger.info("Initializing CSV ranking mode..."); logger.info("Initializing CSV ranking mode...");
@@ -143,7 +187,7 @@ async function ensureCSVLoaded(): Promise<void> {
const seen = new Set<string>(); const seen = new Set<string>();
for (const path of CSV_PATHS) { for (const path of CSV_PATHS) {
await processCSV(path, seen); await processNormalizationCSV(path, seen);
} }
const cache: EmbeddingCache = { const cache: EmbeddingCache = {
@@ -162,7 +206,7 @@ async function ensureCSVLoaded(): Promise<void> {
logger.info("CSV mode ready"); logger.info("CSV mode ready");
} }
async function processCSV( async function processNormalizationCSV(
path: string, path: string,
seen: Set<string> seen: Set<string>
): Promise<void> { ): Promise<void> {
@@ -190,6 +234,40 @@ async function processCSV(
logger.info("Finished CSV: %s", path); logger.info("Finished CSV: %s", path);
} }
// Load currently in progress, if any. Shared so that overlapping callers wait on
// the same work instead of each reading the file and appending to the same arrays.
let jsonlLoadInFlight: Promise<void> | null = null;

/**
 * Ensures the JSONL example trigger events are loaded into the module-level
 * `jsonl*` state. Returns immediately once a load has succeeded; overlapping
 * calls share a single in-flight load. A failed load rejects and may be retried
 * by a later call.
 */
async function ensureExampleClaimJsonlLoaded(): Promise<void> {
  if (jsonlLoaded) return;

  if (jsonlLoadInFlight === null) {
    jsonlLoadInFlight = loadExampleClaimJsonl().finally(() => {
      // Clear the handle whether the load succeeded or failed, so a failure can be retried.
      jsonlLoadInFlight = null;
    });
  }

  await jsonlLoadInFlight;
}

/**
 * Reads JSONL_PATH line by line, embeds each usable row's `text`, and publishes
 * the results to the module-level `jsonl*` state only after the whole file has
 * been processed, so a failure part-way through leaves no partial data behind.
 *
 * @throws Error if JSONL_PATH does not exist, or if embedding / index building fails.
 */
async function loadExampleClaimJsonl(): Promise<void> {
  logger.info("Initializing JSONL ranking...");

  // Fail with a clear message rather than an opaque stream error.
  // NOTE(review): confirm whether a missing examples file should instead mean "no examples".
  if (!fs.existsSync(JSONL_PATH)) {
    throw new Error(`Example JSONL file not found: ${JSONL_PATH}`);
  }

  const rawtexts: string[] = [];
  const cleantexts: string[] = [];
  const embeddings: number[][] = [];

  const stream = fs.createReadStream(JSONL_PATH);
  const rl = readline.createInterface({
    input: stream,
    crlfDelay: Infinity,
  });

  try {
    let lineNumber = 0;
    for await (const line of rl) {
      lineNumber += 1;
      if (!line.trim()) continue; // skip empty lines

      const example = parseExampleLine(line);
      if (example === null) {
        // One bad row should not discard every other example.
        logger.info("Skipping unusable JSONL line %d in %s", lineNumber, JSONL_PATH);
        continue;
      }

      const embedding = await embedText(example.text);
      rawtexts.push(example.text);
      cleantexts.push(example.content);
      embeddings.push(embedding);
    }
  } finally {
    // Release the file handle even when embedding throws mid-file.
    rl.close();
    stream.destroy();
  }

  // Build the index before publishing anything, so the shared state is all-or-nothing.
  const bm25 = buildBM25(rawtexts);

  jsonlRawtexts = rawtexts;
  jsonlCleantexts = cleantexts;
  jsonlEmbeddings = embeddings;
  jsonlBM25 = bm25;
  jsonlLoaded = true;

  logger.info("JSONL ranking done");
}

/**
 * Extracts `text` and `output[0].content` from one JSONL line.
 *
 * @returns The pair, or `null` when the line is not valid JSON, `text` is not a
 * string, or `output[0].content` is absent. Non-string content is serialised
 * with `JSON.stringify` so the stored value is always a string.
 */
function parseExampleLine(line: string): { text: string; content: string } | null {
  let row: unknown;
  try {
    row = JSON.parse(line);
  } catch {
    return null;
  }
  if (typeof row !== "object" || row === null) return null;

  const { text, output } = row as { text?: unknown; output?: unknown };
  if (typeof text !== "string" || !Array.isArray(output)) return null;

  const first: unknown = output[0];
  if (typeof first !== "object" || first === null) return null;

  const content = (first as { content?: unknown }).content;
  if (content == null) return null;

  return { text, content: typeof content === "string" ? content : JSON.stringify(content) };
}
async function embedText(text: string): Promise<number[]> { async function embedText(text: string): Promise<number[]> {
const output = await featureExtractor(text, { const output = await featureExtractor(text, {
@@ -276,4 +354,7 @@ function reciprocalRankFusion(
// { id: 5, rawtext: "She lost her balance and fell down the stairs." }, // { id: 5, rawtext: "She lost her balance and fell down the stairs." },
// ] // ]
// ) // )
// ); // );
// await ensureExampleClaimJsonlLoaded()
// console.log(await rankExampleTriggerEvents("Niger"))
+1 -1
View File
@@ -2,7 +2,7 @@ import { tool } from "@langchain/core/tools";
import * as z from "zod"; import * as z from "zod";
import { queryScraper } from "./webSearch"; import { queryScraper } from "./webSearch";
import { extractWebpageContent } from "./webpageFetch"; import { extractWebpageContent } from "./webpageFetch";
import { rankDynamically } from "./clan/retreiveExamples"; import { rankDynamically } from "./retreiveExamples";
export async function rankAndDisplayData(data: string[], context: string):Promise<string> { export async function rankAndDisplayData(data: string[], context: string):Promise<string> {
+2
View File
@@ -0,0 +1,2 @@
*
!.gitignore
Executable
+52
View File
@@ -0,0 +1,52 @@
#!/usr/bin/env bash
# Project launcher: starts one component of the project, chosen by the first argument.
# Usage: ./runproject [agent|ragas_service|frontend|fetch|wrapper|analysis]
set -e

# All component paths below are relative, so anchor them to this script's own
# directory instead of wherever the caller happens to be.
# NOTE(review): assumes this script sits next to the agent/ and supporting/ directories — confirm.
cd -- "$(dirname -- "${BASH_SOURCE[0]}")"

# Start the LangGraph agent dev server.
run_agent () {
    echo "Starting LangGraph agent..."
    cd agent
    npx @langchain/langgraph-cli dev
}

# Start the RAGAS service with uvicorn on port 8001, using that service's own virtualenv.
run_ragas_service () {
    echo "Starting RAGAS service..."
    cd "supporting/RAGAS_Service"
    .venv/bin/uvicorn ragas_service:app --port 8001
}

# Start the Streamlit frontend, using the scorer's own virtualenv.
run_frontend () {
    echo "Starting frontend (Streamlit)..."
    cd "supporting/scorer"
    .venv/bin/streamlit run display.py
}

# Run the fetch job.
run_fetch () {
    echo "Running fetch job..."
    cd "supporting/dbkf"
    python fetch.py
}

# Run the wrapper via its npm dev script.
run_wrapper () {
    echo "Running wrapper..."
    cd "supporting/Wrapper"
    npm run dev
}

# Run the analysis script.
run_analysis () {
    echo "Running analysis..."
    cd "supporting/scorer"
    python analyse.py
}

# Dispatch on the first argument; anything else (including no argument) is an error.
case "$1" in
    agent) run_agent ;;
    ragas_service) run_ragas_service ;;
    frontend) run_frontend ;;
    fetch) run_fetch ;;
    wrapper) run_wrapper ;;
    analysis) run_analysis ;;
    *)
        # Diagnostics go to stderr so they do not mix with component output.
        echo "Unknown command: $1" >&2
        echo "Usage: ./runproject [agent|ragas_service|frontend|fetch|wrapper|analysis]" >&2
        exit 1
        ;;
esac
-4
View File
@@ -1,7 +1,3 @@
# -------- Ours --------
claims.json
results*.jsonl
# --------- Github ----------- # --------- Github -----------
# Logs # Logs
logs logs
+2 -2
View File
@@ -5,8 +5,8 @@ import cliProgress from "cli-progress";
import pLimit from "p-limit"; import pLimit from "p-limit";
const INPUT_FILE = "./claims.json"; const INPUT_FILE = "../../data/claims.json";
const OUTPUT_FILE = "./results.jsonl"; const OUTPUT_FILE = "../../data/results.jsonl";
const API_URL = "http://localhost:2024"; const API_URL = "http://localhost:2024";
const AGENT_NAME = "agent"; const AGENT_NAME = "agent";
const MAX_CONCURRENCY = 50; const MAX_CONCURRENCY = 50;
+3 -3
View File
@@ -13,16 +13,16 @@ DEFAULT_PARAMS = {
"concept": "http://weverify.eu/resource/Concept/Q212", "concept": "http://weverify.eu/resource/Concept/Q212",
"documentTypes": "http://schema.org/Claim", "documentTypes": "http://schema.org/Claim",
"from": "2000-01-01", "from": "2000-01-01",
"to": "2023-10-17", "to": "2026-10-17",
"lang": "en", "lang": "en",
"limit": 50, # Max per page "limit": 300, # Max per page
"page": 1, "page": 1,
"orderBy": "date" "orderBy": "date"
} }
NUM_RANDOM_CLAIMS = 10 NUM_RANDOM_CLAIMS = 10
OUTPUT_FILE = "../Wrapper/claims.json" OUTPUT_FILE = "../../data/claims.json"
def fetch_claims(params=None): def fetch_claims(params=None):
if params is None: if params is None:
+1 -1
View File
@@ -4,7 +4,7 @@ from statistics import mean
# ------------------------------------------------------------ # ------------------------------------------------------------
# Load JSONL file # Load JSONL file
# ------------------------------------------------------------ # ------------------------------------------------------------
DATA_FILE = "../Wrapper/results.jsonl" DATA_FILE = "../../data/results.jsonl"
data = [] data = []
with open(DATA_FILE, "r", encoding="utf-8") as f: with open(DATA_FILE, "r", encoding="utf-8") as f:
+1 -1
View File
@@ -4,7 +4,7 @@ import random
from pathlib import Path from pathlib import Path
# Path to your JSONL file # Path to your JSONL file
DATA_FILE = "../Wrapper/results.jsonl" DATA_FILE = "../../data/results.jsonl"
# -------------------------- # --------------------------
# Helper functions # Helper functions