Move all data to own folder. Add run shell script. Experiment (unsuccessfully so far) with example retreival
This commit is contained in:
+1
-2
@@ -1,6 +1,5 @@
|
||||
# -------- Ours --------
|
||||
tools/clan/*.csv
|
||||
tools/clan/*.json
|
||||
|
||||
|
||||
# --------- Github -----------
|
||||
# Logs
|
||||
|
||||
+5
-1
@@ -10,6 +10,7 @@ import { produceRanking } from "./nodes/produceRanking";
|
||||
import { createModelNode } from "./nodes/model";
|
||||
import { loopEndConditional } from "./conditionals/loop_end";
|
||||
import { sort } from "./nodes/sort";
|
||||
import { triggerEventSetup } from "./nodes/triggerEventSetup";
|
||||
|
||||
const triggerEventToolNode = createToolNode(triggerEventToolsByName);
|
||||
|
||||
@@ -26,6 +27,7 @@ const agent = new StateGraph(MessagesState)
|
||||
.addNode(normalizationSetup.name, normalizationSetup)
|
||||
.addNode("normalisationModel", normalisationModel)
|
||||
|
||||
.addNode(triggerEventSetup.name, triggerEventSetup)
|
||||
.addNode("triggerEventToolNode", triggerEventToolNode)
|
||||
.addNode("triggerEventModel", triggerEventModel)
|
||||
|
||||
@@ -39,7 +41,9 @@ const agent = new StateGraph(MessagesState)
|
||||
|
||||
.addEdge(START, normalizationSetup.name)
|
||||
.addEdge(normalizationSetup.name, "normalisationModel")
|
||||
.addEdge("normalisationModel", "triggerEventModel")
|
||||
.addEdge("normalisationModel", triggerEventSetup.name)
|
||||
|
||||
.addEdge(triggerEventSetup.name, "triggerEventModel")
|
||||
|
||||
// @ts-expect-error
|
||||
.addConditionalEdges("triggerEventModel", triggerEventToolConditional, ["triggerEventToolNode", verificationSetup.name])
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
import { GraphNode } from "@langchain/langgraph";
|
||||
import { MessagesState } from "../state";
|
||||
import { AIMessage, BaseMessage, HumanMessage } from "@langchain/core/messages";
|
||||
import { rankFromCSV } from "../tools/clan/retreiveExamples";
|
||||
import { rankNormalizedClaims } from "../tools/retreiveExamples";
|
||||
|
||||
export const normalizationSetup: GraphNode<typeof MessagesState> = async (state) => {
|
||||
let similarityResults = await rankFromCSV(state.disinformationTitle)
|
||||
let similarityResults = await rankNormalizedClaims(state.disinformationTitle)
|
||||
|
||||
let messages : BaseMessage[] = similarityResults.map((item) => {
|
||||
return new AIMessage(`Original Claim: ${item.rawtext}. \n\n Normalised Claim: ${item.cleantext}`)
|
||||
|
||||
@@ -0,0 +1,17 @@
|
||||
import { GraphNode } from "@langchain/langgraph";
|
||||
import { MessagesState } from "../state";
|
||||
import { AIMessage, BaseMessage, HumanMessage } from "@langchain/core/messages";
|
||||
import { rankExampleTriggerEvents, rankNormalizedClaims } from "../tools/retreiveExamples";
|
||||
|
||||
export const triggerEventSetup: GraphNode<typeof MessagesState> = async (state) => {
|
||||
let nc = state?.messages?.at(-1)?.content ?? "" //keep a copy of normalized trigger event. Again two things, womp womp
|
||||
|
||||
//Now give in-context examples. hopwfully we can self-teach?
|
||||
let similarityResults = await rankExampleTriggerEvents(state.disinformationTitle)
|
||||
|
||||
let messages : BaseMessage[] = similarityResults.map((item) => {
|
||||
return new AIMessage(`Event: ${item.rawtext}, Claims and given scores: ${item.cleantext}`)
|
||||
})
|
||||
|
||||
return { messages: messages, disinformationTitle: state.disinformationTitle, normalizedClaim: nc };
|
||||
};
|
||||
@@ -9,6 +9,7 @@ export async function hydratePrompt(path: string, state: any) : Promise<string>
|
||||
|
||||
raw = raw.replace("###TITLE###", state.disinformationTitle);
|
||||
raw = raw.replace("###LM###", state.messages.at(-1).content);
|
||||
raw = raw.replace("###NTITLE###", state.normalizedClaim);
|
||||
|
||||
if (raw.indexOf("###TECLAIM###") != -1) {
|
||||
const title = state.proposedTriggerEvent[state.proposedTriggerEventIndex].Event
|
||||
|
||||
@@ -2,7 +2,7 @@ You are an agent in a pipeline to analyse disinformation.
|
||||
Once the information has been created as below, a dataset can be created to feed a model for prediction, which will improve pre-bunking efforts.
|
||||
|
||||
There is a false disinformation claim circulating:
|
||||
###LM###
|
||||
("###NTITLE###
|
||||
Produce up-to 5 specific "trigger events" that happened that could have led to the spread of this disinformation.
|
||||
|
||||
Remember the time frame of the disinformation campaign: {{CAMPAIGN_DATE}}
|
||||
@@ -24,4 +24,6 @@ Use your abilities to look between the lines and produce some insightful analysi
|
||||
|
||||
Events will be reordered as part of processing, each statement must stand alone
|
||||
|
||||
The preceeding messages act as examples of previous responses to potentially ficitonal events and scores given, to help understanding the intended quality of analysis
|
||||
|
||||
Lets go through it step by step
|
||||
@@ -21,5 +21,6 @@ export const MessagesState = new StateSchema({
|
||||
messages: MessagesValue,
|
||||
proposedTriggerEvent: ProposedTriggerEventArray,
|
||||
proposedTriggerEventIndex: z.int(),
|
||||
normalizedClaim: z.string(),
|
||||
});
|
||||
|
||||
|
||||
@@ -3,14 +3,17 @@ import fs from "fs";
|
||||
import { pipeline, cos_sim } from "@huggingface/transformers";
|
||||
import bm25Factory from "wink-bm25-text-search";
|
||||
import nlp from "wink-nlp-utils";
|
||||
import { logger } from "../../utils/logger";
|
||||
import { logger } from "../utils/logger";
|
||||
import readline from "readline";
|
||||
|
||||
const CSV_PATHS = [
|
||||
"./tools/clan/dev-eng.csv",
|
||||
"./tools/clan/train-eng.csv",
|
||||
"../data/dev-eng.csv",
|
||||
"../data/train-eng.csv",
|
||||
];
|
||||
|
||||
const CACHE_PATH = "./tools/clan/csv.cache.json";
|
||||
const CACHE_PATH = "../data/csv.cache.json";
|
||||
|
||||
const JSONL_PATH = "../data/results.jsonl"
|
||||
|
||||
type EmbeddingCache = {
|
||||
rawtexts: string[];
|
||||
@@ -36,6 +39,13 @@ let csvEmbeddings: number[][] = [];
|
||||
let csvBM25: any = null;
|
||||
let csvLoaded = false;
|
||||
|
||||
let jsonlRawtexts: string[] = [];
|
||||
let jsonlCleantexts: string[] = [];
|
||||
let jsonlEmbeddings: number[][] = [];
|
||||
let jsonlBM25: any = null;
|
||||
let jsonlLoaded = false;
|
||||
|
||||
|
||||
logger.info("Loading embedding model...");
|
||||
const featureExtractor = await pipeline(
|
||||
"feature-extraction",
|
||||
@@ -43,12 +53,12 @@ const featureExtractor = await pipeline(
|
||||
);
|
||||
logger.info("Embedding model loaded");
|
||||
|
||||
//Cached entrypoint
|
||||
export async function rankFromCSV(
|
||||
//Cached entrypoints
|
||||
export async function rankNormalizedClaims(
|
||||
query: string,
|
||||
topK = 5
|
||||
): Promise<RankedResult[]> {
|
||||
await ensureCSVLoaded();
|
||||
await ensureNormalizedClaimCSVLoaded();
|
||||
|
||||
logger.info("Ranking from CSV cache...");
|
||||
|
||||
@@ -78,6 +88,40 @@ export async function rankFromCSV(
|
||||
return ranked.slice(0, topK);
|
||||
}
|
||||
|
||||
export async function rankExampleTriggerEvents(
|
||||
query: string,
|
||||
topK = 5
|
||||
): Promise<RankedResult[]> {
|
||||
await ensureExampleClaimJsonlLoaded();
|
||||
|
||||
logger.info("Ranking from JSONL cache...");
|
||||
|
||||
const queryEmbedding = await embedText(query);
|
||||
|
||||
const denseScores = jsonlEmbeddings.map((docEmbedding) =>
|
||||
cos_sim(docEmbedding, queryEmbedding)
|
||||
);
|
||||
|
||||
const sparseScores = computeSparseScores(query, jsonlBM25, jsonlRawtexts);
|
||||
|
||||
const fusedScores = reciprocalRankFusion([denseScores, sparseScores]);
|
||||
|
||||
const ranked = jsonlRawtexts
|
||||
.map((text, i) => ({
|
||||
id: i,
|
||||
rawtext: text,
|
||||
cleantext: jsonlCleantexts[i],
|
||||
denseScore: denseScores[i],
|
||||
sparseScore: sparseScores[i],
|
||||
fusedScore: fusedScores[i],
|
||||
}))
|
||||
.sort((a, b) => b.fusedScore - a.fusedScore);
|
||||
|
||||
logger.info("Ranking complete (JSONL mode)");
|
||||
|
||||
return ranked.slice(0, topK);
|
||||
}
|
||||
|
||||
//Dynamic Entrypoint
|
||||
export async function rankDynamically(
|
||||
query: string,
|
||||
@@ -121,7 +165,7 @@ export async function rankDynamically(
|
||||
}
|
||||
|
||||
//CSV stuff
|
||||
async function ensureCSVLoaded(): Promise<void> {
|
||||
async function ensureNormalizedClaimCSVLoaded(): Promise<void> {
|
||||
if (csvLoaded) return;
|
||||
|
||||
logger.info("Initializing CSV ranking mode...");
|
||||
@@ -143,7 +187,7 @@ async function ensureCSVLoaded(): Promise<void> {
|
||||
const seen = new Set<string>();
|
||||
|
||||
for (const path of CSV_PATHS) {
|
||||
await processCSV(path, seen);
|
||||
await processNormalizationCSV(path, seen);
|
||||
}
|
||||
|
||||
const cache: EmbeddingCache = {
|
||||
@@ -162,7 +206,7 @@ async function ensureCSVLoaded(): Promise<void> {
|
||||
logger.info("CSV mode ready");
|
||||
}
|
||||
|
||||
async function processCSV(
|
||||
async function processNormalizationCSV(
|
||||
path: string,
|
||||
seen: Set<string>
|
||||
): Promise<void> {
|
||||
@@ -190,6 +234,40 @@ async function processCSV(
|
||||
logger.info("Finished CSV: %s", path);
|
||||
}
|
||||
|
||||
async function ensureExampleClaimJsonlLoaded(): Promise<void> {
|
||||
if (jsonlLoaded) return;
|
||||
|
||||
logger.info("Initializing JSONL ranking...");
|
||||
//TODO: make jsonl parsins
|
||||
const stream = fs.createReadStream(JSONL_PATH);
|
||||
|
||||
const rl = readline.createInterface({
|
||||
input: stream,
|
||||
crlfDelay: Infinity,
|
||||
});
|
||||
|
||||
for await (const line of rl) {
|
||||
if (!line.trim()) continue; // skip empty lines
|
||||
|
||||
const row = JSON.parse(line);
|
||||
|
||||
const text = row.text;
|
||||
|
||||
const embedding = await embedText(text);
|
||||
|
||||
jsonlRawtexts.push(text);
|
||||
|
||||
jsonlCleantexts.push(row.output[0].content);
|
||||
jsonlEmbeddings.push(embedding);
|
||||
}
|
||||
|
||||
|
||||
jsonlBM25 = buildBM25(jsonlRawtexts);
|
||||
|
||||
jsonlLoaded = true;
|
||||
logger.info("JSONL ranking done");
|
||||
}
|
||||
|
||||
|
||||
async function embedText(text: string): Promise<number[]> {
|
||||
const output = await featureExtractor(text, {
|
||||
@@ -277,3 +355,6 @@ function reciprocalRankFusion(
|
||||
// ]
|
||||
// )
|
||||
// );
|
||||
|
||||
// await ensureExampleClaimJsonlLoaded()
|
||||
// console.log(await rankExampleTriggerEvents("Niger"))
|
||||
@@ -2,7 +2,7 @@ import { tool } from "@langchain/core/tools";
|
||||
import * as z from "zod";
|
||||
import { queryScraper } from "./webSearch";
|
||||
import { extractWebpageContent } from "./webpageFetch";
|
||||
import { rankDynamically } from "./clan/retreiveExamples";
|
||||
import { rankDynamically } from "./retreiveExamples";
|
||||
|
||||
|
||||
export async function rankAndDisplayData(data: string[], context: string):Promise<string> {
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
*
|
||||
!.gitignore
|
||||
@@ -0,0 +1,52 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
run_agent () {
|
||||
echo "Starting LangGraph agent..."
|
||||
cd agent
|
||||
npx @langchain/langgraph-cli dev
|
||||
}
|
||||
|
||||
run_ragas_service () {
|
||||
echo "Starting RAGAS service..."
|
||||
cd "supporting/RAGAS_Service"
|
||||
.venv/bin/uvicorn ragas_service:app --port 8001
|
||||
}
|
||||
|
||||
run_frontend () {
|
||||
echo "Starting frontend (Streamlit)..."
|
||||
cd "supporting/scorer"
|
||||
.venv/bin/streamlit run display.py
|
||||
}
|
||||
|
||||
run_fetch () {
|
||||
echo "Running fetch job..."
|
||||
cd "supporting/dbkf"
|
||||
python fetch.py
|
||||
}
|
||||
|
||||
run_wrapper () {
|
||||
echo "Running wrapper..."
|
||||
cd "supporting/Wrapper"
|
||||
npm run dev
|
||||
}
|
||||
|
||||
run_analysis () {
|
||||
cd supporting/scorer
|
||||
python analyse.py
|
||||
}
|
||||
|
||||
case "$1" in
|
||||
agent) run_agent ;;
|
||||
ragas_service) run_ragas_service ;;
|
||||
frontend) run_frontend ;;
|
||||
fetch) run_fetch ;;
|
||||
wrapper) run_wrapper ;;
|
||||
analysis) run_analysis ;;
|
||||
*)
|
||||
echo "Unknown command: $1"
|
||||
echo "Usage: ./runproject [agent|ragas_service|frontend|fetch|wrapper|analysis]"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
@@ -1,7 +1,3 @@
|
||||
# -------- Ours --------
|
||||
claims.json
|
||||
results*.jsonl
|
||||
|
||||
# --------- Github -----------
|
||||
# Logs
|
||||
logs
|
||||
|
||||
@@ -5,8 +5,8 @@ import cliProgress from "cli-progress";
|
||||
import pLimit from "p-limit";
|
||||
|
||||
|
||||
const INPUT_FILE = "./claims.json";
|
||||
const OUTPUT_FILE = "./results.jsonl";
|
||||
const INPUT_FILE = "../../data/claims.json";
|
||||
const OUTPUT_FILE = "../../data/results.jsonl";
|
||||
const API_URL = "http://localhost:2024";
|
||||
const AGENT_NAME = "agent";
|
||||
const MAX_CONCURRENCY = 50;
|
||||
|
||||
@@ -13,16 +13,16 @@ DEFAULT_PARAMS = {
|
||||
"concept": "http://weverify.eu/resource/Concept/Q212",
|
||||
"documentTypes": "http://schema.org/Claim",
|
||||
"from": "2000-01-01",
|
||||
"to": "2023-10-17",
|
||||
"to": "2026-10-17",
|
||||
"lang": "en",
|
||||
"limit": 50, # Max per page
|
||||
"limit": 300, # Max per page
|
||||
"page": 1,
|
||||
"orderBy": "date"
|
||||
}
|
||||
|
||||
NUM_RANDOM_CLAIMS = 10
|
||||
|
||||
OUTPUT_FILE = "../Wrapper/claims.json"
|
||||
OUTPUT_FILE = "../../data/claims.json"
|
||||
|
||||
def fetch_claims(params=None):
|
||||
if params is None:
|
||||
|
||||
@@ -4,7 +4,7 @@ from statistics import mean
|
||||
# ------------------------------------------------------------
|
||||
# Load JSONL file
|
||||
# ------------------------------------------------------------
|
||||
DATA_FILE = "../Wrapper/results.jsonl"
|
||||
DATA_FILE = "../../data/results.jsonl"
|
||||
|
||||
data = []
|
||||
with open(DATA_FILE, "r", encoding="utf-8") as f:
|
||||
|
||||
@@ -4,7 +4,7 @@ import random
|
||||
from pathlib import Path
|
||||
|
||||
# Path to your JSONL file
|
||||
DATA_FILE = "../Wrapper/results.jsonl"
|
||||
DATA_FILE = "../../data/results.jsonl"
|
||||
|
||||
# --------------------------
|
||||
# Helper functions
|
||||
|
||||
Reference in New Issue
Block a user