Move all data to its own folder. Add a run shell script. Experiment (unsuccessfully so far) with example retrieval

This commit is contained in:
William Jeynes
2026-02-16 22:42:13 +00:00
parent 90894b2c10
commit 3f14b61cd4
16 changed files with 184 additions and 29 deletions
+1 -2
View File
@@ -1,6 +1,5 @@
# -------- Ours --------
tools/clan/*.csv
tools/clan/*.json
# --------- Github -----------
# Logs
+5 -1
View File
@@ -10,6 +10,7 @@ import { produceRanking } from "./nodes/produceRanking";
import { createModelNode } from "./nodes/model";
import { loopEndConditional } from "./conditionals/loop_end";
import { sort } from "./nodes/sort";
import { triggerEventSetup } from "./nodes/triggerEventSetup";
const triggerEventToolNode = createToolNode(triggerEventToolsByName);
@@ -26,6 +27,7 @@ const agent = new StateGraph(MessagesState)
.addNode(normalizationSetup.name, normalizationSetup)
.addNode("normalisationModel", normalisationModel)
.addNode(triggerEventSetup.name, triggerEventSetup)
.addNode("triggerEventToolNode", triggerEventToolNode)
.addNode("triggerEventModel", triggerEventModel)
@@ -39,7 +41,9 @@ const agent = new StateGraph(MessagesState)
.addEdge(START, normalizationSetup.name)
.addEdge(normalizationSetup.name, "normalisationModel")
.addEdge("normalisationModel", "triggerEventModel")
.addEdge("normalisationModel", triggerEventSetup.name)
.addEdge(triggerEventSetup.name, "triggerEventModel")
// @ts-expect-error
.addConditionalEdges("triggerEventModel", triggerEventToolConditional, ["triggerEventToolNode", verificationSetup.name])
+2 -2
View File
@@ -1,10 +1,10 @@
import { GraphNode } from "@langchain/langgraph";
import { MessagesState } from "../state";
import { AIMessage, BaseMessage, HumanMessage } from "@langchain/core/messages";
import { rankFromCSV } from "../tools/clan/retreiveExamples";
import { rankNormalizedClaims } from "../tools/retreiveExamples";
export const normalizationSetup: GraphNode<typeof MessagesState> = async (state) => {
let similarityResults = await rankFromCSV(state.disinformationTitle)
let similarityResults = await rankNormalizedClaims(state.disinformationTitle)
let messages : BaseMessage[] = similarityResults.map((item) => {
return new AIMessage(`Original Claim: ${item.rawtext}. \n\n Normalised Claim: ${item.cleantext}`)
+17
View File
@@ -0,0 +1,17 @@
import { GraphNode } from "@langchain/langgraph";
import { MessagesState } from "../state";
import { AIMessage, BaseMessage, HumanMessage } from "@langchain/core/messages";
import { rankExampleTriggerEvents, rankNormalizedClaims } from "../tools/retreiveExamples";
/**
 * Graph node that prepares state for the trigger-event model.
 *
 * Captures the content of the most recent message as `normalizedClaim`, then
 * retrieves example trigger events ranked against the disinformation title
 * and emits one `AIMessage` per example to serve as in-context examples.
 *
 * @param state - Current graph state; `messages` and `disinformationTitle` are read.
 * @returns A state update carrying the example messages, the unchanged
 *          `disinformationTitle`, and the captured `normalizedClaim`.
 */
export const triggerEventSetup: GraphNode<typeof MessagesState> = async (state) => {
  // Keep a copy of the latest message content as the normalised claim.
  // NOTE(review): this relies on the node running directly after the
  // normalisation model, so that the last message is its output.
  const lastContent: unknown = state?.messages?.at(-1)?.content ?? "";

  // Message content is not guaranteed to be a plain string (it can be a list
  // of content parts), whereas `normalizedClaim` is a string that is later
  // substituted into prompt text. Flatten any text parts so the claim never
  // ends up rendered as "[object Object]".
  const normalizedClaim =
    typeof lastContent === "string"
      ? lastContent
      : Array.isArray(lastContent)
        ? lastContent
            .map((part) =>
              typeof part === "string"
                ? part
                : typeof part?.text === "string"
                  ? part.text
                  : ""
            )
            .join("")
        : String(lastContent);

  // Provide in-context examples of earlier trigger-event analyses.
  const similarityResults = await rankExampleTriggerEvents(state.disinformationTitle);

  const messages: BaseMessage[] = similarityResults.map(
    (item) =>
      new AIMessage(`Event: ${item.rawtext}, Claims and given scores: ${item.cleantext}`)
  );

  return {
    messages: messages,
    disinformationTitle: state.disinformationTitle,
    normalizedClaim: normalizedClaim,
  };
};
+1
View File
@@ -9,6 +9,7 @@ export async function hydratePrompt(path: string, state: any) : Promise<string>
raw = raw.replace("###TITLE###", state.disinformationTitle);
raw = raw.replace("###LM###", state.messages.at(-1).content);
raw = raw.replace("###NTITLE###", state.normalizedClaim);
if (raw.indexOf("###TECLAIM###") != -1) {
const title = state.proposedTriggerEvent[state.proposedTriggerEventIndex].Event
+3 -1
View File
@@ -2,7 +2,7 @@ You are an agent in a pipeline to analyse disinformation.
Once the information has been created as below, a dataset can be created to feed a model for prediction, which will improve pre-bunking efforts.
There is a false disinformation claim circulating:
###LM###
("###NTITLE###
Produce up-to 5 specific "trigger events" that happened that could have led to the spread of this disinformation.
Remember the time frame of the disinformation campaign: {{CAMPAIGN_DATE}}
@@ -24,4 +24,6 @@ Use your abilities to look between the lines and produce some insightful analysi
Events will be reordered as part of processing, each statement must stand alone
The preceding messages act as examples of previous responses to potentially fictional events and scores given, to help understanding the intended quality of analysis
Lets go through it step by step
+1
View File
@@ -21,5 +21,6 @@ export const MessagesState = new StateSchema({
messages: MessagesValue,
proposedTriggerEvent: ProposedTriggerEventArray,
proposedTriggerEventIndex: z.int(),
normalizedClaim: z.string(),
});
@@ -3,14 +3,17 @@ import fs from "fs";
import { pipeline, cos_sim } from "@huggingface/transformers";
import bm25Factory from "wink-bm25-text-search";
import nlp from "wink-nlp-utils";
import { logger } from "../../utils/logger";
import { logger } from "../utils/logger";
import readline from "readline";
const CSV_PATHS = [
"./tools/clan/dev-eng.csv",
"./tools/clan/train-eng.csv",
"../data/dev-eng.csv",
"../data/train-eng.csv",
];
const CACHE_PATH = "./tools/clan/csv.cache.json";
const CACHE_PATH = "../data/csv.cache.json";
const JSONL_PATH = "../data/results.jsonl"
type EmbeddingCache = {
rawtexts: string[];
@@ -36,6 +39,13 @@ let csvEmbeddings: number[][] = [];
let csvBM25: any = null;
let csvLoaded = false;
// In-memory state for the JSONL example set. Index i refers to the same
// record in each of the three arrays. Populated by
// ensureExampleClaimJsonlLoaded() and read by rankExampleTriggerEvents().
let jsonlRawtexts: string[] = []; // `text` field of each JSONL row
let jsonlCleantexts: string[] = []; // `output[0].content` of each JSONL row
let jsonlEmbeddings: number[][] = []; // embedText() vector for each rawtext
let jsonlBM25: any = null; // result of buildBM25() over the rawtexts; null until loaded
let jsonlLoaded = false; // true once the JSONL file has been fully loaded
logger.info("Loading embedding model...");
const featureExtractor = await pipeline(
"feature-extraction",
@@ -43,12 +53,12 @@ const featureExtractor = await pipeline(
);
logger.info("Embedding model loaded");
//Cached entrypoint
export async function rankFromCSV(
//Cached entrypoints
export async function rankNormalizedClaims(
query: string,
topK = 5
): Promise<RankedResult[]> {
await ensureCSVLoaded();
await ensureNormalizedClaimCSVLoaded();
logger.info("Ranking from CSV cache...");
@@ -78,6 +88,40 @@ export async function rankFromCSV(
return ranked.slice(0, topK);
}
/**
 * Ranks the JSONL-backed example trigger events against a query.
 *
 * Each example receives a dense score (`cos_sim` between its stored embedding
 * and the query embedding) and a sparse score (from `computeSparseScores`
 * using `jsonlBM25`); the two score lists are combined by
 * `reciprocalRankFusion` and the examples are ordered by the fused score,
 * highest first.
 *
 * @param query - Text to rank the examples against.
 * @param topK - Maximum number of results to return (default 5).
 * @returns The leading `topK` examples together with their individual scores.
 */
export async function rankExampleTriggerEvents(
  query: string,
  topK = 5
): Promise<RankedResult[]> {
  await ensureExampleClaimJsonlLoaded();

  logger.info("Ranking from JSONL cache...");

  const queryVector = await embedText(query);

  // Score every stored example twice: once densely, once sparsely.
  const dense = jsonlEmbeddings.map((docVector) => cos_sim(docVector, queryVector));
  const sparse = computeSparseScores(query, jsonlBM25, jsonlRawtexts);
  const fused = reciprocalRankFusion([dense, sparse]);

  const candidates = jsonlRawtexts.map((rawtext, id) => {
    return {
      id,
      rawtext,
      cleantext: jsonlCleantexts[id],
      denseScore: dense[id],
      sparseScore: sparse[id],
      fusedScore: fused[id],
    };
  });

  // Best fused score first; `candidates` is a fresh array, so sorting in place is safe.
  candidates.sort((left, right) => right.fusedScore - left.fusedScore);

  logger.info("Ranking complete (JSONL mode)");

  return candidates.slice(0, topK);
}
//Dynamic Entrypoint
export async function rankDynamically(
query: string,
@@ -121,7 +165,7 @@ export async function rankDynamically(
}
//CSV stuff
async function ensureCSVLoaded(): Promise<void> {
async function ensureNormalizedClaimCSVLoaded(): Promise<void> {
if (csvLoaded) return;
logger.info("Initializing CSV ranking mode...");
@@ -143,7 +187,7 @@ async function ensureCSVLoaded(): Promise<void> {
const seen = new Set<string>();
for (const path of CSV_PATHS) {
await processCSV(path, seen);
await processNormalizationCSV(path, seen);
}
const cache: EmbeddingCache = {
@@ -162,7 +206,7 @@ async function ensureCSVLoaded(): Promise<void> {
logger.info("CSV mode ready");
}
async function processCSV(
async function processNormalizationCSV(
path: string,
seen: Set<string>
): Promise<void> {
@@ -190,6 +234,40 @@ async function processCSV(
logger.info("Finished CSV: %s", path);
}
/**
 * Loads the example trigger events from `JSONL_PATH` into the module-level
 * JSONL state on first use; later calls return immediately.
 *
 * Each non-empty line is parsed as JSON. The row's `text` is embedded with
 * `embedText` and stored as the raw text, and `output[0].content` is stored
 * as the matching clean text. After every row is read, `buildBM25` is run
 * over the raw texts.
 *
 * Rows are accumulated locally and only published to the module-level state
 * once the whole file has been processed, so a failure part-way through
 * leaves the state untouched and a later retry cannot append duplicates.
 *
 * @throws Error if a line is not valid JSON, lacks a string `text`, or has no
 *         `output[0]` entry. The message includes the line number.
 */
async function ensureExampleClaimJsonlLoaded(): Promise<void> {
  if (jsonlLoaded) return;

  logger.info("Initializing JSONL ranking...");

  const rawtexts: string[] = [];
  const cleantexts: string[] = [];
  const embeddings: number[][] = [];

  const stream = fs.createReadStream(JSONL_PATH);
  const rl = readline.createInterface({
    input: stream,
    crlfDelay: Infinity,
  });

  try {
    let lineNumber = 0;

    for await (const line of rl) {
      lineNumber += 1;
      if (!line.trim()) continue; // skip empty lines

      let row;
      try {
        row = JSON.parse(line);
      } catch (e: unknown) {
        const reason = e instanceof Error ? e.message : String(e);
        throw new Error(`Invalid JSON on line ${lineNumber} of ${JSONL_PATH}: ${reason}`);
      }

      const text: unknown = row?.text;
      const firstOutput = row?.output?.[0];
      if (typeof text !== "string" || firstOutput == null) {
        throw new Error(
          `Malformed record on line ${lineNumber} of ${JSONL_PATH}: expected a string "text" and a non-empty "output" array`
        );
      }

      const embedding = await embedText(text);

      rawtexts.push(text);
      cleantexts.push(firstOutput.content);
      embeddings.push(embedding);
    }
  } finally {
    // Release the file handle even when a row fails part-way through.
    rl.close();
    stream.destroy();
  }

  const bm25 = buildBM25(rawtexts);

  // Publish everything together, only after the whole load succeeded.
  jsonlRawtexts = rawtexts;
  jsonlCleantexts = cleantexts;
  jsonlEmbeddings = embeddings;
  jsonlBM25 = bm25;
  jsonlLoaded = true;

  logger.info("JSONL ranking done");
}
async function embedText(text: string): Promise<number[]> {
const output = await featureExtractor(text, {
@@ -276,4 +354,7 @@ function reciprocalRankFusion(
// { id: 5, rawtext: "She lost her balance and fell down the stairs." },
// ]
// )
// );
// );
// await ensureExampleClaimJsonlLoaded()
// console.log(await rankExampleTriggerEvents("Niger"))
+1 -1
View File
@@ -2,7 +2,7 @@ import { tool } from "@langchain/core/tools";
import * as z from "zod";
import { queryScraper } from "./webSearch";
import { extractWebpageContent } from "./webpageFetch";
import { rankDynamically } from "./clan/retreiveExamples";
import { rankDynamically } from "./retreiveExamples";
export async function rankAndDisplayData(data: string[], context: string):Promise<string> {