Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 4e0bab9897 | |||
| c4dac3f515 | |||
| 2252a42466 | |||
| 75ca1032a6 | |||
| 00d129bd28 | |||
| cf923d6e87 | |||
| f821e9643d | |||
| 43ecd04135 | |||
| 8c0921057b | |||
| b610e8c989 | |||
| f8d4155b7c |
@@ -1,9 +1,22 @@
|
|||||||
# AI models for identifying trigger events in disinformation analysis
|
# AI models for identifying trigger events in disinformation analysis
|
||||||
Final Dissertation Submission Repository
|
Final Dissertation Submission Repository
|
||||||
|
|
||||||
## Project Description
|
## Abstract
|
||||||
-- todo --
|
-- todo --
|
||||||
|
|
||||||
|
[Project Presentation](https://jillweynes.github.io/LLMsForDisinformationPrediction-GraphVizBuilt/presentation)
|
||||||
|
|
||||||
|
## Generated Dataset Link and Usage Experiments
|
||||||
|
Generated Dataset Link: [https://huggingface.co/datasets/WillJeynes/LLMsForDisinformationAnalysis-Dataset](https://huggingface.co/datasets/WillJeynes/LLMsForDisinformationAnalysis-Dataset)
|
||||||
|
|
||||||
|
Graph-Based Dataset Visualisation: [https://jillweynes.github.io/LLMsForDisinformationPrediction-GraphVizBuilt/](https://jillweynes.github.io/LLMsForDisinformationPrediction-GraphVizBuilt/)
|
||||||
|
|
||||||
|
Usage Experiments (incl. graph visualisation) Source Code: [https://github.com/WillJeynes/LLMsForDisinformationPrediction](https://github.com/WillJeynes/LLMsForDisinformationPrediction)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# This repository:
|
||||||
|
|
||||||
## Solution Diagram
|
## Solution Diagram
|
||||||
-- todo --
|
-- todo --
|
||||||
|
|
||||||
@@ -13,8 +26,6 @@ Final Dissertation Submission Repository
|
|||||||
## Agent Refinement
|
## Agent Refinement
|
||||||
[See agent](/agent/)
|
[See agent](/agent/)
|
||||||
|
|
||||||
## Generated Database Link and Usage Experiments
|
|
||||||
-- todo --
|
|
||||||
|
|
||||||
## Repository Structure
|
## Repository Structure
|
||||||
```
|
```
|
||||||
|
|||||||
+30
-1
@@ -1,3 +1,32 @@
|
|||||||
## Refining the agent output
|
## Refining the agent output
|
||||||
|
|
||||||
TODO: Table and document experiments
|
Experiments modifying the pipeline:
|
||||||
|
|
||||||
|
| Experiment | % Correct | Relative Change |
|
||||||
|
|------------------|----------:|---------:|
|
||||||
|
| BASELINE | 33 | 0 |
|
||||||
|
| Improv Prompt | 39.96 | 0.21 |
|
||||||
|
| Add Examples | 44.67 | 0.35 |
|
||||||
|
| Date | 45.51 | 0.38 |
|
||||||
|
| Chain of Thought | 43.38 | 0.31 |
|
||||||
|
| Self-Critique | 44.36 | 0.34 |
|
||||||
|
|
||||||
|
Experiments with different model types:
|
||||||
|
| Model | % Correct | % Change |
|
||||||
|
|-------------------------------|----------:|---------:|
|
||||||
|
| gpt-5-mini | 45.51 | |
|
||||||
|
| gpt-5.4-mini | 32.4 | |
|
||||||
|
| gpt-5.4-nano | 23.28 | |
|
||||||
|
| gpt-4.1-mini | 27.85 | |
|
||||||
|
| gpt-4o-mini | 32.47 | |
|
||||||
|
| llama3.1:8b-instruct-q4_K_M | ? | |
|
||||||
|
| qwen3.5:9b | 0 | |
|
||||||
|
|
||||||
|
Percentage of valid URLs:
|
||||||
|
| Model | Valid / Total | % Valid |
|
||||||
|
|-------------------------------|----------:|---------:|
|
||||||
|
| gpt-5-mini | 22/405 | 5.43 |
|
||||||
|
| gpt-5.4-mini | 29/278 | 10.43 |
|
||||||
|
| gpt-5.4-nano | 6/210 | 2.85 |
|
||||||
|
| gpt-4.1-mini | 15/269 | 5.57 |
|
||||||
|
| gpt-4o-mini | 27/287 | 9.407 |
|
||||||
+2
-20
@@ -11,18 +11,13 @@ import { loopEndConditional } from "./conditionals/loop_end";
|
|||||||
import { sort } from "./nodes/sort";
|
import { sort } from "./nodes/sort";
|
||||||
import { triggerEventSetup } from "./nodes/triggerEventSetup";
|
import { triggerEventSetup } from "./nodes/triggerEventSetup";
|
||||||
import { createEnsembleNode } from "./nodes/ensembleNode";
|
import { createEnsembleNode } from "./nodes/ensembleNode";
|
||||||
import { selfEvalSetup } from "./nodes/selfEvalSetup";
|
|
||||||
|
|
||||||
const triggerEventToolNode = createToolNode(triggerEventToolsByName);
|
const triggerEventToolNode = createToolNode(triggerEventToolsByName);
|
||||||
const peToolNode = createToolNode(triggerEventToolsByName);
|
|
||||||
|
|
||||||
const normalisationModel = createModelNode([], "normalization.txt");
|
const normalisationModel = createModelNode([], "normalization.txt");
|
||||||
const triggerEventModel = createModelNode(triggerEventToolsByName, "trigger.txt");
|
const triggerEventModel = createModelNode(triggerEventToolsByName, "trigger.txt");
|
||||||
const evaluationModel = createModelNode([], "eval.txt");
|
|
||||||
const peModel = createModelNode(triggerEventToolsByName, "posteval.txt");
|
|
||||||
|
|
||||||
const triggerEventToolConditional = createToolConditional("triggerEventToolNode", selfEvalSetup.name);
|
const triggerEventToolConditional = createToolConditional("triggerEventToolNode", verificationSetup.name);
|
||||||
const peToolConditional = createToolConditional("peToolNode", verificationSetup.name);
|
|
||||||
|
|
||||||
const roNode = createEnsembleNode("ROBERTA", "roberta");
|
const roNode = createEnsembleNode("ROBERTA", "roberta");
|
||||||
const flNode = createEnsembleNode("FLAN", "flan");
|
const flNode = createEnsembleNode("FLAN", "flan");
|
||||||
@@ -38,12 +33,6 @@ const agent = new StateGraph(MessagesState)
|
|||||||
.addNode("triggerEventToolNode", triggerEventToolNode)
|
.addNode("triggerEventToolNode", triggerEventToolNode)
|
||||||
.addNode("triggerEventModel", triggerEventModel)
|
.addNode("triggerEventModel", triggerEventModel)
|
||||||
|
|
||||||
.addNode(selfEvalSetup.name, selfEvalSetup)
|
|
||||||
.addNode("evaluationModel", evaluationModel)
|
|
||||||
|
|
||||||
.addNode("peToolNode", peToolNode)
|
|
||||||
.addNode("peModel", peModel)
|
|
||||||
|
|
||||||
.addNode(verificationSetup.name, verificationSetup)
|
.addNode(verificationSetup.name, verificationSetup)
|
||||||
|
|
||||||
.addNode("roNode", roNode)
|
.addNode("roNode", roNode)
|
||||||
@@ -60,16 +49,9 @@ const agent = new StateGraph(MessagesState)
|
|||||||
.addEdge(triggerEventSetup.name, "triggerEventModel")
|
.addEdge(triggerEventSetup.name, "triggerEventModel")
|
||||||
|
|
||||||
// @ts-expect-error
|
// @ts-expect-error
|
||||||
.addConditionalEdges("triggerEventModel", triggerEventToolConditional, ["triggerEventToolNode", selfEvalSetup.name])
|
.addConditionalEdges("triggerEventModel", triggerEventToolConditional, ["triggerEventToolNode", verificationSetup.name])
|
||||||
.addEdge("triggerEventToolNode", "triggerEventModel")
|
.addEdge("triggerEventToolNode", "triggerEventModel")
|
||||||
|
|
||||||
.addEdge(selfEvalSetup.name, "evaluationModel")
|
|
||||||
.addEdge("evaluationModel", "peModel")
|
|
||||||
|
|
||||||
// @ts-expect-error
|
|
||||||
.addConditionalEdges("peModel", peToolConditional, ["peToolNode", verificationSetup.name])
|
|
||||||
.addEdge("peToolNode", "peModel")
|
|
||||||
|
|
||||||
.addEdge(verificationSetup.name, "roNode")
|
.addEdge(verificationSetup.name, "roNode")
|
||||||
.addEdge(verificationSetup.name, "flNode")
|
.addEdge(verificationSetup.name, "flNode")
|
||||||
.addEdge(verificationSetup.name, "lrNode")
|
.addEdge(verificationSetup.name, "lrNode")
|
||||||
|
|||||||
@@ -1,21 +0,0 @@
|
|||||||
import { GraphNode } from "@langchain/langgraph";
|
|
||||||
import { MessagesState, ProposedTriggerEventArray } from "../state";
|
|
||||||
import { logger } from "../utils/logger";
|
|
||||||
import { queryScraper } from "../tools/webSearch";
|
|
||||||
import { rankAndDisplayData } from "../tools/triggerEventTools";
|
|
||||||
|
|
||||||
export const selfEvalSetup: GraphNode<typeof MessagesState> = async (state) => {
|
|
||||||
let genResponse = state.messages.at(-1)?.content.toString() ?? "";
|
|
||||||
const parsed = ProposedTriggerEventArray.parse(JSON.parse(genResponse));
|
|
||||||
|
|
||||||
for (let i = 0; i < parsed.length; i++) {
|
|
||||||
const search = parsed[i].SearchQuery
|
|
||||||
const data = await queryScraper(search);
|
|
||||||
const output = await rankAndDisplayData(data, search);
|
|
||||||
|
|
||||||
parsed[i].context = output;
|
|
||||||
}
|
|
||||||
|
|
||||||
return { evalTriggerEvent: parsed };
|
|
||||||
|
|
||||||
};
|
|
||||||
@@ -1,8 +1,7 @@
|
|||||||
import { GraphNode } from "@langchain/langgraph";
|
import { GraphNode } from "@langchain/langgraph";
|
||||||
import { MessagesState, ProposedTriggerEventArray } from "../state";
|
import { MessagesState, ProposedTriggerEventArray } from "../state";
|
||||||
import { logger } from "../utils/logger";
|
import { logger } from "../utils/logger";
|
||||||
import { queryScraper } from "../tools/webSearch";
|
import { jsonrepair } from 'jsonrepair'
|
||||||
import { rankAndDisplayData } from "../tools/triggerEventTools";
|
|
||||||
|
|
||||||
export const verificationSetup: GraphNode<typeof MessagesState> = async (state) => {
|
export const verificationSetup: GraphNode<typeof MessagesState> = async (state) => {
|
||||||
//this is kinda doing two things, but having two nodes for it seems overkill
|
//this is kinda doing two things, but having two nodes for it seems overkill
|
||||||
@@ -11,7 +10,30 @@ export const verificationSetup: GraphNode<typeof MessagesState> = async (state)
|
|||||||
logger.warn("No trigger events in memory, parsing")
|
logger.warn("No trigger events in memory, parsing")
|
||||||
|
|
||||||
let genResponse = state.messages.at(-1)?.content.toString() ?? "";
|
let genResponse = state.messages.at(-1)?.content.toString() ?? "";
|
||||||
const parsed = ProposedTriggerEventArray.parse(JSON.parse(genResponse));
|
|
||||||
|
const repaired = jsonrepair(genResponse);
|
||||||
|
|
||||||
|
let parsed;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const json = JSON.parse(repaired);
|
||||||
|
|
||||||
|
if (Array.isArray(json)) {
|
||||||
|
parsed = ProposedTriggerEventArray.parse(json);
|
||||||
|
} else {
|
||||||
|
// try grab first value
|
||||||
|
const firstValue = Object.values(json)[0];
|
||||||
|
|
||||||
|
if (Array.isArray(firstValue)) {
|
||||||
|
parsed = ProposedTriggerEventArray.parse(firstValue);
|
||||||
|
} else {
|
||||||
|
throw new Error("No array found in JSON");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (err: any) {
|
||||||
|
logger.error(`Failed to parse LLM response: ${err.message}`);
|
||||||
|
throw new Error(`Failed to parse LLM response: ${err}`);
|
||||||
|
}
|
||||||
|
|
||||||
return { proposedTriggerEvent: parsed, proposedTriggerEventIndex: 0 };
|
return { proposedTriggerEvent: parsed, proposedTriggerEventIndex: 0 };
|
||||||
}
|
}
|
||||||
|
|||||||
Generated
+10
@@ -20,6 +20,7 @@
|
|||||||
"dotenv": "^17.2.3",
|
"dotenv": "^17.2.3",
|
||||||
"exponential-backoff": "^3.1.3",
|
"exponential-backoff": "^3.1.3",
|
||||||
"fs": "^0.0.1-security",
|
"fs": "^0.0.1-security",
|
||||||
|
"jsonrepair": "^3.13.3",
|
||||||
"langchain": "^1.2.14",
|
"langchain": "^1.2.14",
|
||||||
"selenium-webdriver": "^4.40.0",
|
"selenium-webdriver": "^4.40.0",
|
||||||
"tldts": "^7.0.23",
|
"tldts": "^7.0.23",
|
||||||
@@ -2075,6 +2076,15 @@
|
|||||||
"integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==",
|
"integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==",
|
||||||
"license": "ISC"
|
"license": "ISC"
|
||||||
},
|
},
|
||||||
|
"node_modules/jsonrepair": {
|
||||||
|
"version": "3.13.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/jsonrepair/-/jsonrepair-3.13.3.tgz",
|
||||||
|
"integrity": "sha512-BTznj0owIt2CBAH/LTo7+1I5pMvl1e1033LRl/HUowlZmJOIhzC0zbX5bxMngLkfT4WnzPP26QnW5wMr2g9tsQ==",
|
||||||
|
"license": "ISC",
|
||||||
|
"bin": {
|
||||||
|
"jsonrepair": "bin/cli.js"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/jszip": {
|
"node_modules/jszip": {
|
||||||
"version": "3.10.1",
|
"version": "3.10.1",
|
||||||
"resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz",
|
"resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz",
|
||||||
|
|||||||
@@ -24,6 +24,7 @@
|
|||||||
"dotenv": "^17.2.3",
|
"dotenv": "^17.2.3",
|
||||||
"exponential-backoff": "^3.1.3",
|
"exponential-backoff": "^3.1.3",
|
||||||
"fs": "^0.0.1-security",
|
"fs": "^0.0.1-security",
|
||||||
|
"jsonrepair": "^3.13.3",
|
||||||
"langchain": "^1.2.14",
|
"langchain": "^1.2.14",
|
||||||
"selenium-webdriver": "^4.40.0",
|
"selenium-webdriver": "^4.40.0",
|
||||||
"tldts": "^7.0.23",
|
"tldts": "^7.0.23",
|
||||||
|
|||||||
@@ -1,30 +0,0 @@
|
|||||||
You are an impartial and meticulous evaluator assessing LLM’s response based on key quality dimensions
|
|
||||||
of honesty and usefulness. Your goal is to provide structured feedback that can be used to improve the
|
|
||||||
response.
|
|
||||||
Evaluation task: please follow these steps carefully:
|
|
||||||
1. Analyze the response based on the three dimensions below.
|
|
||||||
2. Provide justifications first: write a brief explanation justifying your assessment for each dimension.
|
|
||||||
3. Assign scores after justification: assign a score from 1 (poor) to 10 (excellent) for each dimension
|
|
||||||
based on your justification.
|
|
||||||
4. Synthesize: provide a brief overall impression and the single most important suggestion for improve-
|
|
||||||
ment, keeping in mind that explanation/honesty is the top priority, then followed by guidance.
|
|
||||||
Critique dimensions (evaluate in this order):
|
|
||||||
(1) Specificity and usefulness: Can the proposed event be used to create a dataset of concrete events mapped to later
|
|
||||||
disinformation.
|
|
||||||
(2) Existence: Using the context provided, can the user be certain that the proposed trigger event actually happened?
|
|
||||||
(3) Causality: Is there a possible link from the proposed trigger event to the disinformation at hand?
|
|
||||||
Overall impression & key improvement suggestion: Briefly summarize the overall quality and state the
|
|
||||||
most critical change needed to improve the response.
|
|
||||||
|
|
||||||
Disinformation query:
|
|
||||||
###NTITLE###
|
|
||||||
Disinformation date:
|
|
||||||
###CDATE###
|
|
||||||
|
|
||||||
LLM’s response to evaluate:
|
|
||||||
###LM###
|
|
||||||
|
|
||||||
Provided context:
|
|
||||||
###VESEARCHES###
|
|
||||||
|
|
||||||
Let's think it through step by step
|
|
||||||
@@ -15,10 +15,6 @@ export async function hydratePrompt(path: string, state: any) : Promise<string>
|
|||||||
raw = raw.replace("###LM###", state.messages.at(-1).content);
|
raw = raw.replace("###LM###", state.messages.at(-1).content);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (raw.indexOf("###L2M###") != -1) {
|
|
||||||
raw = raw.replace("###L2M###", state.messages.at(-2).content);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (raw.indexOf("###NTITLE###") != -1) {
|
if (raw.indexOf("###NTITLE###") != -1) {
|
||||||
raw = raw.replace("###NTITLE###", state.normalizedClaim);
|
raw = raw.replace("###NTITLE###", state.normalizedClaim);
|
||||||
}
|
}
|
||||||
@@ -37,12 +33,5 @@ export async function hydratePrompt(path: string, state: any) : Promise<string>
|
|||||||
raw = raw.replace("###TESEARCH###", output)
|
raw = raw.replace("###TESEARCH###", output)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (raw.indexOf("###VESEARCHES###") != -1) {
|
|
||||||
const output = state.evalTriggerEvent
|
|
||||||
.map(e => e.context)
|
|
||||||
.join("\n")
|
|
||||||
raw = raw.replace("###VESEARCHES###", output)
|
|
||||||
}
|
|
||||||
|
|
||||||
return raw;
|
return raw;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,40 +0,0 @@
|
|||||||
You are an expert editor tasked with making targeted improvements to an existing LLM’s response based
|
|
||||||
on a specific critique with the primary goal of enhancing its score according to evaluation standards while
|
|
||||||
preserving its strengths.
|
|
||||||
Your revision task: generate a revised version of the existing response. Your goal is not to rewrite it
|
|
||||||
completely, but to make precise edits only to address the specific weaknesses highlighted in the critique.
|
|
||||||
Instructions for editing:
|
|
||||||
- Identify specific flaws: carefully read the critique and pinpoint the exact issues raised (e.g., unclear
|
|
||||||
explanation, vagueness, inappropriate responses, the key suggestion).
|
|
||||||
- Perform minimal targeted edits: modify only the necessary sentences or paragraphs within the existing
|
|
||||||
response to directly fix these identified flaws.
|
|
||||||
- Strongly preserve strengths: crucially keep all other parts of the existing response intact. Do not
|
|
||||||
rephrase, restructure, or remove sections that were not criticized or likely contributed positively to its
|
|
||||||
initial score.
|
|
||||||
- Ensure coherence: verify that your targeted edits integrate smoothly and do not introduce contradictions
|
|
||||||
or awkward phrasing.
|
|
||||||
Output requirements:
|
|
||||||
- It should feel like a slightly polished or corrected version of the existing response, not a fundamentally
|
|
||||||
different answer.
|
|
||||||
- Do not mention the critique, scores, or the editing process. The output should be clean json that passes validation checks
|
|
||||||
|
|
||||||
Again, use a JSON format with each entry containing "Event,ReasoningWhyRelevant,SearchQuery,Url,Date".
|
|
||||||
Use tools available to you if further information is required
|
|
||||||
|
|
||||||
Add no new events, only improve the existing items
|
|
||||||
|
|
||||||
Disinformation query:
|
|
||||||
###NTITLE###
|
|
||||||
Disinformation date:
|
|
||||||
###CDATE###
|
|
||||||
|
|
||||||
LLM’s response to improve:
|
|
||||||
###L2M###
|
|
||||||
|
|
||||||
Critique:
|
|
||||||
###LM###
|
|
||||||
|
|
||||||
This contains specific feedback, justifications, scores from 1 to 10, and potentially a key improvement
|
|
||||||
suggestion. Focus on the justifications for low scores and the key suggestion.
|
|
||||||
|
|
||||||
Let's think it through step by step
|
|
||||||
@@ -1,9 +0,0 @@
|
|||||||
Could the following real-world event:
|
|
||||||
###TECLAIM###
|
|
||||||
|
|
||||||
Be a trigger for the following disinformation:
|
|
||||||
###TITLE###
|
|
||||||
|
|
||||||
Respond with "RELATION", followed by : followed by a confidence score (VERYHIGH, HIGH, MEDIUM, LOW, VERYLOW) followed by : followed by the reason. Use no other words, just return the score and reason in format.
|
|
||||||
|
|
||||||
Ignore whether the event happened or not; purely consider the likelihood of causation
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
Do the search results cited below
|
|
||||||
###TESEARCH###
|
|
||||||
Support the idea that the following happened:
|
|
||||||
###TECLAIM###
|
|
||||||
|
|
||||||
Respond with "CONFIDENCE", followed by : followed by a confidence score (VERYHIGH, HIGH, MEDIUM, LOW, VERYLOW) followed by : followed by the reason. Use no other words, just return the score and reason in format.
|
|
||||||
|
|
||||||
Dates can be off by a few days, that would still be valid
|
|
||||||
@@ -21,7 +21,6 @@ export const MessagesState = new StateSchema({
|
|||||||
date: z.string(),
|
date: z.string(),
|
||||||
messages: MessagesValue,
|
messages: MessagesValue,
|
||||||
proposedTriggerEvent: ProposedTriggerEventArray,
|
proposedTriggerEvent: ProposedTriggerEventArray,
|
||||||
evalTriggerEvent: ProposedTriggerEventArray,
|
|
||||||
proposedTriggerEventIndex: z.int(),
|
proposedTriggerEventIndex: z.int(),
|
||||||
normalizedClaim: z.string(),
|
normalizedClaim: z.string(),
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ set -e
|
|||||||
run_agent () {
|
run_agent () {
|
||||||
echo "Starting LangGraph agent..."
|
echo "Starting LangGraph agent..."
|
||||||
cd agent
|
cd agent
|
||||||
npx @langchain/langgraph-cli dev
|
npx @langchain/langgraph-cli@1.1.17 dev
|
||||||
}
|
}
|
||||||
|
|
||||||
run_ensemble_service () {
|
run_ensemble_service () {
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ datasets
|
|||||||
# ROBERTA
|
# ROBERTA
|
||||||
scikit-learn
|
scikit-learn
|
||||||
transformers[torch]
|
transformers[torch]
|
||||||
|
sentence_transformers
|
||||||
|
|
||||||
# Utils
|
# Utils
|
||||||
numpy
|
numpy
|
||||||
|
|||||||
@@ -19,6 +19,9 @@ const MODE = process.env.MODE ?? "claim";
|
|||||||
|
|
||||||
const MAX_CONCURRENCY = 5;
|
const MAX_CONCURRENCY = 5;
|
||||||
|
|
||||||
|
const OFFSET = parseInt(process.env.OFFSET ?? "0", 10);
|
||||||
|
const LIMIT = process.env.LIMIT ? parseInt(process.env.LIMIT, 10) : null;
|
||||||
|
|
||||||
const client = new Client({ apiUrl: API_URL });
|
const client = new Client({ apiUrl: API_URL });
|
||||||
|
|
||||||
|
|
||||||
@@ -164,9 +167,18 @@ async function processRecord(record: any): Promise<ResultRecord> {
|
|||||||
async function main() {
|
async function main() {
|
||||||
console.log("Reading input file...");
|
console.log("Reading input file...");
|
||||||
|
|
||||||
const records = await loadInputs();
|
const allRecords = await loadInputs();
|
||||||
|
|
||||||
console.log(`Loaded ${records.length} records`);
|
console.log(`Loaded ${allRecords.length} records`);
|
||||||
|
|
||||||
|
const records = allRecords.slice(
|
||||||
|
OFFSET,
|
||||||
|
LIMIT !== null ? OFFSET + LIMIT : undefined
|
||||||
|
);
|
||||||
|
|
||||||
|
console.log(
|
||||||
|
`Processing ${records.length} records (offset=${OFFSET}, limit=${LIMIT ?? "∞"})`
|
||||||
|
);
|
||||||
|
|
||||||
fs.writeFileSync(OUTPUT_FILE, "", { flag: "a" });
|
fs.writeFileSync(OUTPUT_FILE, "", { flag: "a" });
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,119 @@
|
|||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.common.exceptions import WebDriverException, TimeoutException, StaleElementReferenceException
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
def init_driver():
    """Create a headless Chrome WebDriver configured for lightweight URL checks.

    Returns:
        A ``webdriver.Chrome`` instance whose page-load timeout is 30 seconds.
        The caller owns the driver and is responsible for calling ``quit()``.
    """
    options = Options()
    # Headless mode is requested only through the ``--headless`` switch below.
    # The ``Options.headless`` setter was deprecated and later removed from
    # Selenium 4, so assigning it was redundant with the explicit switch.
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1920,1080")
    prefs = {
        "profile.managed_default_content_settings.images": 2,  # block images
        "profile.default_content_setting_values.stylesheets": 2,  # block CSS
        "profile.managed_default_content_settings.cookies": 2,  # optional
    }
    options.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome(options=options)
    # Bound how long a single navigation may take (seconds).
    driver.set_page_load_timeout(30)
    return driver
|
||||||
|
|
||||||
|
def is_root_url(url):
    """Return True when the path component of ``url`` is empty or just "/".

    Only the path is inspected; scheme, host, query string and fragment
    play no part in the decision.
    """
    path = urlparse(url).path
    return path == "" or path == "/"
|
||||||
|
|
||||||
|
def is_404_page(driver):
    """Heuristically report whether the currently loaded page looks like a 404.

    The check is a plain substring search for "404" in the lower-cased page
    title and in the lower-cased text of the ``body`` element.

    Returns:
        True if "404" appears in either text; False otherwise, and also False
        whenever reading the page raises (stale element or any other error).
    """
    try:
        page_title = driver.title.lower()
        page_body = driver.find_element("tag name", "body").text.lower()
        return any("404" in text for text in (page_title, page_body))
    except Exception:
        # Stale-element errors and every other failure are treated the same:
        # the page is simply not classified as a 404.
        return False
|
||||||
|
|
||||||
|
def check_url_selenium(url):
    """Load ``url`` in a freshly created browser and judge whether it is usable.

    Returns:
        ``(True, None)`` when the page loads, is not flagged by ``is_404_page``
        and does not end up on a root URL. Otherwise ``(False, reason)``, where
        ``reason`` is a short description or the text of the Selenium error.
    """
    browser = None
    try:
        browser = init_driver()
        browser.get(url)

        if is_404_page(browser):
            verdict = (False, "404 page detected")
        else:
            # Inspect the address after any redirects have been followed.
            landed_on = browser.current_url
            if is_root_url(landed_on):
                verdict = (False, f"Redirected to root URL ({landed_on})")
            else:
                verdict = (True, None)
        return verdict
    except (WebDriverException, TimeoutException) as exc:
        return False, str(exc)
    finally:
        # Always release the browser, including when driver creation or
        # navigation failed part-way through.
        if browser:
            browser.quit()
|
||||||
|
|
||||||
|
def process_event(event):
    """Validate the URL of one event, skipping events scored 0.4 or lower.

    Args:
        event: dict read with the keys "score" (defaults to 0) and "Url".

    Returns:
        A ``(url, is_valid, error_msg)`` tuple. ``url`` is None when the event
        is skipped because of a low score or a missing/empty URL.

    Side effects:
        When the URL is actually checked, the result is stored on the event
        under the "url_valid" key.
    """
    if event.get("score", 0) <= 0.4:
        return None, False, "Score too low"

    target = event.get("Url")
    if not target:
        return None, False, "No URL"

    outcome = check_url_selenium(target)
    is_valid, error_msg = outcome
    event["url_valid"] = is_valid
    return target, is_valid, error_msg
|
||||||
|
|
||||||
|
def process_jsonl_file(file_path, max_workers=4):
    """Validate the URLs of high-scoring events in a JSONL file and print a summary.

    Each non-blank line is parsed as JSON. Lines whose "status" is not
    "success" are ignored, and only events with a "score" above 0.4 are
    checked. The checks run in a thread pool via ``process_event``.

    Args:
        file_path: Path to the JSONL file to read.
        max_workers: Number of worker threads used for the URL checks.

    Returns:
        None. Results are reported on standard output.
    """
    invalid_urls = []
    valid_urls = 0
    skipped_events = 0

    # Gather events with score > 0.4
    urls_to_check = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # letting json.loads fail on an empty string.
            if not line.strip():
                continue
            line_data = json.loads(line)
            if line_data.get("status") != "success":
                continue
            for event in line_data.get("events", []):
                if event.get("score", 0) > 0.4:
                    urls_to_check.append(event)

    total_urls = len(urls_to_check)

    # ThreadPoolExecutor with tqdm progress bar
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_event = {executor.submit(process_event, e): e for e in urls_to_check}
        for future in tqdm(as_completed(future_to_event), total=total_urls, desc="Checking URLs"):
            url, is_valid, error_msg = future.result()
            if is_valid:
                valid_urls += 1
            elif url:
                invalid_urls.append((url, error_msg))
            else:
                # process_event returned no URL (missing or empty "Url").
                # Previously these fell through to the "valid" counter and
                # inflated the reported success rate.
                skipped_events += 1

    # Summary
    if invalid_urls:
        print("\nList of invalid URLs and reasons:")
        for url, err in invalid_urls:
            print(f"{url} --> {err}")
    print("\n=== URL Validation Summary ===")
    print(f"Total URLs processed: {total_urls}")
    print(f"Valid URLs (loaded successfully): {valid_urls}")
    print(f"Invalid URLs: {len(invalid_urls)}")
    print(f"Skipped events (no URL): {skipped_events}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: one positional JSONL path plus an optional
    # worker count, both handed straight to process_jsonl_file.
    cli = argparse.ArgumentParser(description="Validate URLs in JSONL file events using Selenium")
    cli.add_argument("file_path", type=str, help="Path to the JSONL file")
    cli.add_argument("--workers", type=int, default=4, help="Number of parallel Selenium workers")
    parsed_args = cli.parse_args()

    process_jsonl_file(parsed_args.file_path, max_workers=parsed_args.workers)
|
||||||
@@ -27,7 +27,7 @@ DEFAULT_PARAMS = [
|
|||||||
("organization", "http://weverify.eu/resource/Organization/3727f7b2aa90ec0716693e5464b28d18"), # StopFake
|
("organization", "http://weverify.eu/resource/Organization/3727f7b2aa90ec0716693e5464b28d18"), # StopFake
|
||||||
]
|
]
|
||||||
|
|
||||||
NUM_RANDOM_CLAIMS = 200
|
NUM_RANDOM_CLAIMS = 2000
|
||||||
|
|
||||||
INPUT_FILE = "../../data/input.jsonl"
|
INPUT_FILE = "../../data/input.jsonl"
|
||||||
OUTPUT_FILE = "../../data/claims.json"
|
OUTPUT_FILE = "../../data/claims.json"
|
||||||
|
|||||||
Reference in New Issue
Block a user