Move all data to own folder. Add run shell script. Experiment (unsuccessfully so far) with example retreival

This commit is contained in:
William Jeynes
2026-02-16 22:42:13 +00:00
parent 90894b2c10
commit 3f14b61cd4
16 changed files with 184 additions and 29 deletions
+1 -2
View File
@@ -1,6 +1,5 @@
# -------- Ours --------
tools/clan/*.csv
tools/clan/*.json
# --------- Github -----------
# Logs
+5 -1
View File
@@ -10,6 +10,7 @@ import { produceRanking } from "./nodes/produceRanking";
import { createModelNode } from "./nodes/model";
import { loopEndConditional } from "./conditionals/loop_end";
import { sort } from "./nodes/sort";
import { triggerEventSetup } from "./nodes/triggerEventSetup";
const triggerEventToolNode = createToolNode(triggerEventToolsByName);
@@ -26,6 +27,7 @@ const agent = new StateGraph(MessagesState)
.addNode(normalizationSetup.name, normalizationSetup)
.addNode("normalisationModel", normalisationModel)
.addNode(triggerEventSetup.name, triggerEventSetup)
.addNode("triggerEventToolNode", triggerEventToolNode)
.addNode("triggerEventModel", triggerEventModel)
@@ -39,7 +41,9 @@ const agent = new StateGraph(MessagesState)
.addEdge(START, normalizationSetup.name)
.addEdge(normalizationSetup.name, "normalisationModel")
.addEdge("normalisationModel", "triggerEventModel")
.addEdge("normalisationModel", triggerEventSetup.name)
.addEdge(triggerEventSetup.name, "triggerEventModel")
// @ts-expect-error
.addConditionalEdges("triggerEventModel", triggerEventToolConditional, ["triggerEventToolNode", verificationSetup.name])
+2 -2
View File
@@ -1,10 +1,10 @@
import { GraphNode } from "@langchain/langgraph";
import { MessagesState } from "../state";
import { AIMessage, BaseMessage, HumanMessage } from "@langchain/core/messages";
import { rankFromCSV } from "../tools/clan/retreiveExamples";
import { rankNormalizedClaims } from "../tools/retreiveExamples";
export const normalizationSetup: GraphNode<typeof MessagesState> = async (state) => {
let similarityResults = await rankFromCSV(state.disinformationTitle)
let similarityResults = await rankNormalizedClaims(state.disinformationTitle)
let messages : BaseMessage[] = similarityResults.map((item) => {
return new AIMessage(`Original Claim: ${item.rawtext}. \n\n Normalised Claim: ${item.cleantext}`)
+17
View File
@@ -0,0 +1,17 @@
import { GraphNode } from "@langchain/langgraph";
import { MessagesState } from "../state";
import { AIMessage, BaseMessage, HumanMessage } from "@langchain/core/messages";
import { rankExampleTriggerEvents, rankNormalizedClaims } from "../tools/retreiveExamples";
export const triggerEventSetup: GraphNode<typeof MessagesState> = async (state) => {
let nc = state?.messages?.at(-1)?.content ?? "" //keep a copy of normalized trigger event. Again two things, womp womp
//Now give in-context examples. hopwfully we can self-teach?
let similarityResults = await rankExampleTriggerEvents(state.disinformationTitle)
let messages : BaseMessage[] = similarityResults.map((item) => {
return new AIMessage(`Event: ${item.rawtext}, Claims and given scores: ${item.cleantext}`)
})
return { messages: messages, disinformationTitle: state.disinformationTitle, normalizedClaim: nc };
};
+1
View File
@@ -9,6 +9,7 @@ export async function hydratePrompt(path: string, state: any) : Promise<string>
raw = raw.replace("###TITLE###", state.disinformationTitle);
raw = raw.replace("###LM###", state.messages.at(-1).content);
raw = raw.replace("###NTITLE###", state.normalizedClaim);
if (raw.indexOf("###TECLAIM###") != -1) {
const title = state.proposedTriggerEvent[state.proposedTriggerEventIndex].Event
+3 -1
View File
@@ -2,7 +2,7 @@ You are an agent in a pipeline to analyse disinformation.
Once the information has been created as below, a dataset can be created to feed a model for prediction, which will improve pre-bunking efforts.
There is a false disinformation claim circulating:
###LM###
("###NTITLE###
Produce up-to 5 specific "trigger events" that happened that could have led to the spread of this disinformation.
Remember the time frame of the disinformation campaign: {{CAMPAIGN_DATE}}
@@ -24,4 +24,6 @@ Use your abilities to look between the lines and produce some insightful analysi
Events will be reordered as part of processing, each statement must stand alone
The preceeding messages act as examples of previous responses to potentially ficitonal events and scores given, to help understanding the intended quality of analysis
Lets go through it step by step
+1
View File
@@ -21,5 +21,6 @@ export const MessagesState = new StateSchema({
messages: MessagesValue,
proposedTriggerEvent: ProposedTriggerEventArray,
proposedTriggerEventIndex: z.int(),
normalizedClaim: z.string(),
});
@@ -3,14 +3,17 @@ import fs from "fs";
import { pipeline, cos_sim } from "@huggingface/transformers";
import bm25Factory from "wink-bm25-text-search";
import nlp from "wink-nlp-utils";
import { logger } from "../../utils/logger";
import { logger } from "../utils/logger";
import readline from "readline";
const CSV_PATHS = [
"./tools/clan/dev-eng.csv",
"./tools/clan/train-eng.csv",
"../data/dev-eng.csv",
"../data/train-eng.csv",
];
const CACHE_PATH = "./tools/clan/csv.cache.json";
const CACHE_PATH = "../data/csv.cache.json";
const JSONL_PATH = "../data/results.jsonl"
type EmbeddingCache = {
rawtexts: string[];
@@ -36,6 +39,13 @@ let csvEmbeddings: number[][] = [];
let csvBM25: any = null;
let csvLoaded = false;
let jsonlRawtexts: string[] = [];
let jsonlCleantexts: string[] = [];
let jsonlEmbeddings: number[][] = [];
let jsonlBM25: any = null;
let jsonlLoaded = false;
logger.info("Loading embedding model...");
const featureExtractor = await pipeline(
"feature-extraction",
@@ -43,12 +53,12 @@ const featureExtractor = await pipeline(
);
logger.info("Embedding model loaded");
//Cached entrypoint
export async function rankFromCSV(
//Cached entrypoints
export async function rankNormalizedClaims(
query: string,
topK = 5
): Promise<RankedResult[]> {
await ensureCSVLoaded();
await ensureNormalizedClaimCSVLoaded();
logger.info("Ranking from CSV cache...");
@@ -78,6 +88,40 @@ export async function rankFromCSV(
return ranked.slice(0, topK);
}
export async function rankExampleTriggerEvents(
query: string,
topK = 5
): Promise<RankedResult[]> {
await ensureExampleClaimJsonlLoaded();
logger.info("Ranking from JSONL cache...");
const queryEmbedding = await embedText(query);
const denseScores = jsonlEmbeddings.map((docEmbedding) =>
cos_sim(docEmbedding, queryEmbedding)
);
const sparseScores = computeSparseScores(query, jsonlBM25, jsonlRawtexts);
const fusedScores = reciprocalRankFusion([denseScores, sparseScores]);
const ranked = jsonlRawtexts
.map((text, i) => ({
id: i,
rawtext: text,
cleantext: jsonlCleantexts[i],
denseScore: denseScores[i],
sparseScore: sparseScores[i],
fusedScore: fusedScores[i],
}))
.sort((a, b) => b.fusedScore - a.fusedScore);
logger.info("Ranking complete (JSONL mode)");
return ranked.slice(0, topK);
}
//Dynamic Entrypoint
export async function rankDynamically(
query: string,
@@ -121,7 +165,7 @@ export async function rankDynamically(
}
//CSV stuff
async function ensureCSVLoaded(): Promise<void> {
async function ensureNormalizedClaimCSVLoaded(): Promise<void> {
if (csvLoaded) return;
logger.info("Initializing CSV ranking mode...");
@@ -143,7 +187,7 @@ async function ensureCSVLoaded(): Promise<void> {
const seen = new Set<string>();
for (const path of CSV_PATHS) {
await processCSV(path, seen);
await processNormalizationCSV(path, seen);
}
const cache: EmbeddingCache = {
@@ -162,7 +206,7 @@ async function ensureCSVLoaded(): Promise<void> {
logger.info("CSV mode ready");
}
async function processCSV(
async function processNormalizationCSV(
path: string,
seen: Set<string>
): Promise<void> {
@@ -190,6 +234,40 @@ async function processCSV(
logger.info("Finished CSV: %s", path);
}
async function ensureExampleClaimJsonlLoaded(): Promise<void> {
if (jsonlLoaded) return;
logger.info("Initializing JSONL ranking...");
//TODO: make jsonl parsins
const stream = fs.createReadStream(JSONL_PATH);
const rl = readline.createInterface({
input: stream,
crlfDelay: Infinity,
});
for await (const line of rl) {
if (!line.trim()) continue; // skip empty lines
const row = JSON.parse(line);
const text = row.text;
const embedding = await embedText(text);
jsonlRawtexts.push(text);
jsonlCleantexts.push(row.output[0].content);
jsonlEmbeddings.push(embedding);
}
jsonlBM25 = buildBM25(jsonlRawtexts);
jsonlLoaded = true;
logger.info("JSONL ranking done");
}
async function embedText(text: string): Promise<number[]> {
const output = await featureExtractor(text, {
@@ -277,3 +355,6 @@ function reciprocalRankFusion(
// ]
// )
// );
// await ensureExampleClaimJsonlLoaded()
// console.log(await rankExampleTriggerEvents("Niger"))
+1 -1
View File
@@ -2,7 +2,7 @@ import { tool } from "@langchain/core/tools";
import * as z from "zod";
import { queryScraper } from "./webSearch";
import { extractWebpageContent } from "./webpageFetch";
import { rankDynamically } from "./clan/retreiveExamples";
import { rankDynamically } from "./retreiveExamples";
export async function rankAndDisplayData(data: string[], context: string):Promise<string> {
+2
View File
@@ -0,0 +1,2 @@
*
!.gitignore
Executable
+52
View File
@@ -0,0 +1,52 @@
#!/usr/bin/env bash
set -e
run_agent () {
echo "Starting LangGraph agent..."
cd agent
npx @langchain/langgraph-cli dev
}
run_ragas_service () {
echo "Starting RAGAS service..."
cd "supporting/RAGAS_Service"
.venv/bin/uvicorn ragas_service:app --port 8001
}
run_frontend () {
echo "Starting frontend (Streamlit)..."
cd "supporting/scorer"
.venv/bin/streamlit run display.py
}
run_fetch () {
echo "Running fetch job..."
cd "supporting/dbkf"
python fetch.py
}
run_wrapper () {
echo "Running wrapper..."
cd "supporting/Wrapper"
npm run dev
}
run_analysis () {
cd supporting/scorer
python analyse.py
}
case "$1" in
agent) run_agent ;;
ragas_service) run_ragas_service ;;
frontend) run_frontend ;;
fetch) run_fetch ;;
wrapper) run_wrapper ;;
analysis) run_analysis ;;
*)
echo "Unknown command: $1"
echo "Usage: ./runproject [agent|ragas_service|frontend|fetch|wrapper|analysis]"
exit 1
;;
esac
-4
View File
@@ -1,7 +1,3 @@
# -------- Ours --------
claims.json
results*.jsonl
# --------- Github -----------
# Logs
logs
+2 -2
View File
@@ -5,8 +5,8 @@ import cliProgress from "cli-progress";
import pLimit from "p-limit";
const INPUT_FILE = "./claims.json";
const OUTPUT_FILE = "./results.jsonl";
const INPUT_FILE = "../../data/claims.json";
const OUTPUT_FILE = "../../data/results.jsonl";
const API_URL = "http://localhost:2024";
const AGENT_NAME = "agent";
const MAX_CONCURRENCY = 50;
+3 -3
View File
@@ -13,16 +13,16 @@ DEFAULT_PARAMS = {
"concept": "http://weverify.eu/resource/Concept/Q212",
"documentTypes": "http://schema.org/Claim",
"from": "2000-01-01",
"to": "2023-10-17",
"to": "2026-10-17",
"lang": "en",
"limit": 50, # Max per page
"limit": 300, # Max per page
"page": 1,
"orderBy": "date"
}
NUM_RANDOM_CLAIMS = 10
OUTPUT_FILE = "../Wrapper/claims.json"
OUTPUT_FILE = "../../data/claims.json"
def fetch_claims(params=None):
if params is None:
+1 -1
View File
@@ -4,7 +4,7 @@ from statistics import mean
# ------------------------------------------------------------
# Load JSONL file
# ------------------------------------------------------------
DATA_FILE = "../Wrapper/results.jsonl"
DATA_FILE = "../../data/results.jsonl"
data = []
with open(DATA_FILE, "r", encoding="utf-8") as f:
+1 -1
View File
@@ -4,7 +4,7 @@ import random
from pathlib import Path
# Path to your JSONL file
DATA_FILE = "../Wrapper/results.jsonl"
DATA_FILE = "../../data/results.jsonl"
# --------------------------
# Helper functions