Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 4e0bab9897 | |||
| c4dac3f515 | |||
| 2252a42466 | |||
| 75ca1032a6 | |||
| 00d129bd28 | |||
| cf923d6e87 | |||
| f821e9643d | |||
| 43ecd04135 | |||
| 8c0921057b |
@@ -1,9 +1,22 @@
|
|||||||
# AI models for identifying trigger events in disinformation analysis
|
# AI models for identifying trigger events in disinformation analysis
|
||||||
Final Dissertation Submission Repository
|
Final Dissertation Submission Repository
|
||||||
|
|
||||||
## Project Description
|
## Abstract
|
||||||
-- todo --
|
-- todo --
|
||||||
|
|
||||||
|
[Project Presentation](https://jillweynes.github.io/LLMsForDisinformationPrediction-GraphVizBuilt/presentation)
|
||||||
|
|
||||||
|
## Generated Dataset Link and Usage Experiments
|
||||||
|
Generated Dataset Link: [https://huggingface.co/datasets/WillJeynes/LLMsForDisinformationAnalysis-Dataset](https://huggingface.co/datasets/WillJeynes/LLMsForDisinformationAnalysis-Dataset)
|
||||||
|
|
||||||
|
Graph-Based Dataset Visualisation: [https://jillweynes.github.io/LLMsForDisinformationPrediction-GraphVizBuilt/](https://jillweynes.github.io/LLMsForDisinformationPrediction-GraphVizBuilt/)
|
||||||
|
|
||||||
|
Usage Experiments (incl. graph visualisation) Source Code: [https://github.com/WillJeynes/LLMsForDisinformationPrediction](https://github.com/WillJeynes/LLMsForDisinformationPrediction)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# This repository:
|
||||||
|
|
||||||
## Solution Diagram
|
## Solution Diagram
|
||||||
-- todo --
|
-- todo --
|
||||||
|
|
||||||
@@ -13,8 +26,6 @@ Final Dissertation Submission Repository
|
|||||||
## Agent Refinement
|
## Agent Refinement
|
||||||
[See agent](/agent/)
|
[See agent](/agent/)
|
||||||
|
|
||||||
## Generated Database Link and Usage Experiments
|
|
||||||
-- todo --
|
|
||||||
|
|
||||||
## Repository Structure
|
## Repository Structure
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -4,4 +4,3 @@ LANGSMITH_API_KEY=123456
|
|||||||
LANGSMITH_ENDPOINT=https://eu.api.smith.langchain.com
|
LANGSMITH_ENDPOINT=https://eu.api.smith.langchain.com
|
||||||
SCRAPER_INSTANCE=https://example.com
|
SCRAPER_INSTANCE=https://example.com
|
||||||
SCRAPER_PARAM_ANYTHING=else
|
SCRAPER_PARAM_ANYTHING=else
|
||||||
RANKING_URL=http://localhost:8000/evaluate
|
|
||||||
+30
-1
@@ -1,3 +1,32 @@
|
|||||||
## Refining the agent output
|
## Refining the agent output
|
||||||
|
|
||||||
TODO: Table and document experiments
|
Experiments modifying the pipeline:
|
||||||
|
|
||||||
|
| Pipeline Variant | % Correct | Relative Change |
|
||||||
|
|------------------|----------:|---------:|
|
||||||
|
| BASELINE | 33 | 0 |
|
||||||
|
| Improved Prompt | 39.96 | 0.21 |
|
||||||
|
| Add Examples | 44.67 | 0.35 |
|
||||||
|
| Date | 45.51 | 0.38 |
|
||||||
|
| Chain of Thought | 43.38 | 0.31 |
|
||||||
|
| Self-Critique | 44.36 | 0.34 |
|
||||||
|
|
||||||
|
Experiments with different model types:
|
||||||
|
| Model | % Correct | % Change |
|
||||||
|
|-------------------------------|----------:|---------:|
|
||||||
|
| gpt-5-mini | 45.51 | |
|
||||||
|
| gpt-5.4-mini | 32.4 | |
|
||||||
|
| gpt-5.4-nano | 23.28 | |
|
||||||
|
| gpt-4.1-mini | 27.85 | |
|
||||||
|
| gpt-4o-mini | 32.47 | |
|
||||||
|
| llama3.1:8b-instruct-q4_K_M | ? | |
|
||||||
|
| qwen3.5:9b | 0 | |
|
||||||
|
|
||||||
|
Percentage of valid URLs:
|
||||||
|
| Model | Valid / Total | % Valid |
|
||||||
|
|-------------------------------|----------:|---------:|
|
||||||
|
| gpt-5-mini | 22/405 | 5.43 |
|
||||||
|
| gpt-5.4-mini | 29/278 | 10.43 |
|
||||||
|
| gpt-5.4-nano | 6/210 | 2.86 |
|
||||||
|
| gpt-4.1-mini | 15/269 | 5.58 |
|
||||||
|
| gpt-4o-mini | 27/287 | 9.41 |
|
||||||
+7
-10
@@ -1,28 +1,25 @@
|
|||||||
import { SystemMessage } from "@langchain/core/messages";
|
import { HumanMessage, SystemMessage } from "@langchain/core/messages";
|
||||||
import { GraphNode } from "@langchain/langgraph";
|
import { GraphNode } from "@langchain/langgraph";
|
||||||
import { MessagesState } from "../state";
|
import { MessagesState } from "../state";
|
||||||
import { ChatOllama } from "@langchain/ollama";
|
import { ChatOpenAI } from "@langchain/openai"
|
||||||
import { hydratePrompt } from "../prompts/hydratePrompt";
|
import { hydratePrompt } from "../prompts/hydratePrompt";
|
||||||
import { logger } from "../utils/logger";
|
|
||||||
|
|
||||||
export function createModelNode(tools: any, promptPath: string): GraphNode<typeof MessagesState> {
|
export function createModelNode(tools: any, promptPath: string): GraphNode<typeof MessagesState> {
|
||||||
return async (state) => {
|
return async (state) => {
|
||||||
const sysPrompt = await hydratePrompt(promptPath, state);
|
const sysPrompt = await hydratePrompt(promptPath, state);
|
||||||
|
|
||||||
const model = new ChatOllama({
|
const model = new ChatOpenAI({
|
||||||
model: "llama3.1:8b-instruct-q4_K_M",
|
model: "gpt-5-mini"
|
||||||
temperature: 0.3
|
|
||||||
});
|
});
|
||||||
|
|
||||||
const modelWithTools = model.bindTools(Object.values(tools));
|
const modelWithTools = model.bindTools(Object.values(tools));
|
||||||
|
|
||||||
const response = await modelWithTools.invoke([
|
const response = await modelWithTools.invoke([
|
||||||
new SystemMessage(sysPrompt),
|
new SystemMessage(
|
||||||
|
sysPrompt
|
||||||
|
),
|
||||||
...state.messages,
|
...state.messages,
|
||||||
]);
|
]);
|
||||||
|
|
||||||
logger.error(response);
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
messages: [response]
|
messages: [response]
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -3,16 +3,8 @@ import { MessagesState } from "../state";
|
|||||||
import { AIMessage, BaseMessage } from "@langchain/core/messages";
|
import { AIMessage, BaseMessage } from "@langchain/core/messages";
|
||||||
import { rankExampleTriggerEvents } from "../tools/retreiveExamples";
|
import { rankExampleTriggerEvents } from "../tools/retreiveExamples";
|
||||||
|
|
||||||
function extractTE(text: string) {
|
|
||||||
const match = text.match(/<norm>([\s\S]*?)<\/norm>/);
|
|
||||||
if (!match) throw new Error("Nothing found between <norm> tags");
|
|
||||||
return match[1].trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
export const triggerEventSetup: GraphNode<typeof MessagesState> = async (state) => {
|
export const triggerEventSetup: GraphNode<typeof MessagesState> = async (state) => {
|
||||||
let raw = state?.messages?.at(-1)?.content ?? "" //keep a copy of normalized trigger event. Again two things, womp womp
|
let nc = state?.messages?.at(-1)?.content ?? "" //keep a copy of normalized trigger event. Again two things, womp womp
|
||||||
let nc = extractTE(raw.toString())
|
|
||||||
|
|
||||||
//Now give in-context examples. hopwfully we can self-teach?
|
//Now give in-context examples. hopwfully we can self-teach?
|
||||||
let similarityResults = await rankExampleTriggerEvents(state.disinformationTitle)
|
let similarityResults = await rankExampleTriggerEvents(state.disinformationTitle)
|
||||||
|
|||||||
@@ -1,31 +1,20 @@
|
|||||||
import { GraphNode } from "@langchain/langgraph";
|
import { GraphNode } from "@langchain/langgraph";
|
||||||
import { MessagesState, ProposedTriggerEventArray } from "../state";
|
import { MessagesState, ProposedTriggerEventArray } from "../state";
|
||||||
import { logger } from "../utils/logger";
|
import { logger } from "../utils/logger";
|
||||||
import { jsonrepair } from 'jsonrepair';
|
import { jsonrepair } from 'jsonrepair'
|
||||||
|
|
||||||
function extractJSON(text: string) {
|
|
||||||
const match = text.match(/<json>([\s\S]*?)<\/json>/);
|
|
||||||
if (!match) throw new Error("No JSON found between <json> tags");
|
|
||||||
return match[1].trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
export const verificationSetup: GraphNode<typeof MessagesState> = async (state) => {
|
export const verificationSetup: GraphNode<typeof MessagesState> = async (state) => {
|
||||||
|
//this is kinda doing two things, but having two nodes for it seems overkill
|
||||||
|
|
||||||
if (state.proposedTriggerEvent == undefined) {
|
if (state.proposedTriggerEvent == undefined) {
|
||||||
logger.warn("No trigger events in memory, parsing");
|
logger.warn("No trigger events in memory, parsing")
|
||||||
|
|
||||||
const genResponse = state.messages.at(-1)?.content.toString() ?? "";
|
let genResponse = state.messages.at(-1)?.content.toString() ?? "";
|
||||||
|
|
||||||
let repaired: string;
|
const repaired = jsonrepair(genResponse);
|
||||||
try {
|
|
||||||
let extracted = extractJSON(genResponse)
|
|
||||||
repaired = jsonrepair(extracted);
|
|
||||||
} catch (repairErr: any) {
|
|
||||||
logger.error("Failed to repair JSON from LLM response.");
|
|
||||||
logger.error("Original LLM response:\n%s", genResponse);
|
|
||||||
throw new Error(`JSON repair failed: ${repairErr.message}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
let parsed;
|
let parsed;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const json = JSON.parse(repaired);
|
const json = JSON.parse(repaired);
|
||||||
|
|
||||||
@@ -38,23 +27,19 @@ export const verificationSetup: GraphNode<typeof MessagesState> = async (state)
|
|||||||
if (Array.isArray(firstValue)) {
|
if (Array.isArray(firstValue)) {
|
||||||
parsed = ProposedTriggerEventArray.parse(firstValue);
|
parsed = ProposedTriggerEventArray.parse(firstValue);
|
||||||
} else {
|
} else {
|
||||||
logger.error("No array found in JSON after parsing.");
|
throw new Error("No array found in JSON");
|
||||||
logger.error("Repaired JSON:\n%s", repaired);
|
|
||||||
logger.error("Original LLM response:\n%s", genResponse);
|
|
||||||
throw new Error("No array found in JSON structure");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (parseErr: any) {
|
} catch (err: any) {
|
||||||
logger.error("Failed to parse LLM response to JSON or validate array.");
|
logger.error(`Failed to parse LLM response: ${err.message}`);
|
||||||
logger.error("Repaired JSON:\n%s", repaired);
|
throw new Error(`Failed to parse LLM response: ${err}`);
|
||||||
logger.error("Original LLM response:\n%s", genResponse);
|
|
||||||
throw new Error(`Parsing failed: ${parseErr.message}`);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return { proposedTriggerEvent: parsed, proposedTriggerEventIndex: 0 };
|
return { proposedTriggerEvent: parsed, proposedTriggerEventIndex: 0 };
|
||||||
} else {
|
}
|
||||||
logger.info("Trigger event index %s", state.proposedTriggerEventIndex + 1);
|
else {
|
||||||
|
logger.info("Trigger event index %s", state.proposedTriggerEventIndex+1)
|
||||||
|
|
||||||
return { proposedTriggerEvent: state.proposedTriggerEvent, proposedTriggerEventIndex: state.proposedTriggerEventIndex + 1 };
|
return { proposedTriggerEvent: state.proposedTriggerEvent, proposedTriggerEventIndex: state.proposedTriggerEventIndex+1 };
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
Generated
+354
-379
File diff suppressed because it is too large
Load Diff
@@ -17,7 +17,6 @@
|
|||||||
"@langchain/core": "^1.1.17",
|
"@langchain/core": "^1.1.17",
|
||||||
"@langchain/langgraph": "^1.1.2",
|
"@langchain/langgraph": "^1.1.2",
|
||||||
"@langchain/langgraph-sdk": "^1.5.5",
|
"@langchain/langgraph-sdk": "^1.5.5",
|
||||||
"@langchain/ollama": "^1.2.6",
|
|
||||||
"@langchain/openai": "^1.2.3",
|
"@langchain/openai": "^1.2.3",
|
||||||
"axios": "^1.13.5",
|
"axios": "^1.13.5",
|
||||||
"compute-cosine-similarity": "^1.1.0",
|
"compute-cosine-similarity": "^1.1.0",
|
||||||
|
|||||||
@@ -16,7 +16,4 @@ Relevent examples are included in preceeding messages, use these as exact inspir
|
|||||||
The claim to normalize is:
|
The claim to normalize is:
|
||||||
###TITLE###
|
###TITLE###
|
||||||
|
|
||||||
Produce no text other than the condensed claim, surrounded by <norm></norm>
|
Produce no text other than the condensed claim.
|
||||||
|
|
||||||
For example: BREAKING: the sky is green!
|
|
||||||
Becomes: <norm>The sky is green</norm>
|
|
||||||
@@ -17,15 +17,6 @@ Include a url to a source for your trigger event (not a web search, a specific u
|
|||||||
Include the date that the event happened ("March 2022" for example)
|
Include the date that the event happened ("March 2022" for example)
|
||||||
|
|
||||||
Use a JSON format with each entry containing "Event,ReasoningWhyRelevant,SearchQuery,Url,Date".
|
Use a JSON format with each entry containing "Event,ReasoningWhyRelevant,SearchQuery,Url,Date".
|
||||||
Return ONLY JSON, no extra text. Wrap it like this:
|
|
||||||
<json>
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"Event": "Example"
|
|
||||||
...
|
|
||||||
}
|
|
||||||
]
|
|
||||||
</json>
|
|
||||||
|
|
||||||
Multiple tool invocations should be requested at once, if applicable.
|
Multiple tool invocations should be requested at once, if applicable.
|
||||||
Use your abilities to look between the lines and produce some insightful analysis, thinking both short and long term.
|
Use your abilities to look between the lines and produce some insightful analysis, thinking both short and long term.
|
||||||
@@ -35,8 +26,4 @@ Events will be reordered as part of processing, each statement must stand alone
|
|||||||
The preceding messages act as examples of previous responses to potentially fictional events and scores given.
|
The preceding messages act as examples of previous responses to potentially fictional events and scores given.
|
||||||
Analysis should only be completed for proposed events that would garner >0.7 points
|
Analysis should only be completed for proposed events that would garner >0.7 points
|
||||||
|
|
||||||
Since URLs change frequently, use tools to retrieve up-to-date information every time; provided examples or existing knowledge will be wrong or out of date.
|
|
||||||
|
|
||||||
Remember to return just json enclosed by <json></json>
|
|
||||||
|
|
||||||
Let's go through it step by step
|
Let's go through it step by step
|
||||||
@@ -7,7 +7,7 @@ export async function evaluateWithEnsemble({
|
|||||||
answer: string;
|
answer: string;
|
||||||
method: string
|
method: string
|
||||||
}): Promise<{ validProb: number; invalidProb: number; }> {
|
}): Promise<{ validProb: number; invalidProb: number; }> {
|
||||||
const res = await axios.post(process.env.RANKING_URL ?? "http://localhost:8000/evaluate", {
|
const res = await axios.post("http://localhost:8000/evaluate", {
|
||||||
answer,
|
answer,
|
||||||
method
|
method
|
||||||
}, {timeout: 0});
|
}, {timeout: 0});
|
||||||
@@ -18,15 +18,11 @@ export async function evaluateWithEnsemble({
|
|||||||
return {validProb, invalidProb};
|
return {validProb, invalidProb};
|
||||||
}
|
}
|
||||||
|
|
||||||
// import dotenv from "dotenv";
|
// let res = await evaluateWithRoberta({answer: "High-profile political downplaying of COVID-19 (examples: President Trump saying 'it will go away' in March–August 2020)"});
|
||||||
|
|
||||||
// dotenv.config();
|
|
||||||
|
|
||||||
// let res = await evaluateWithEnsemble({method:"flan" ,answer: "High-profile political downplaying of COVID-19 (examples: President Trump saying 'it will go away' in March–August 2020)"});
|
|
||||||
// console.log(res)
|
// console.log(res)
|
||||||
|
|
||||||
// res = await evaluateWithEnsemble({method:"roberta" ,answer: "Multiple mirrored reuploads (2020–2023) put the clip on other channels with titles implying it was a genuine 1970s public information film."});
|
// res = await evaluateWithRoberta({answer: "Multiple mirrored reuploads (2020–2023) put the clip on other channels with titles implying it was a genuine 1970s public information film."});
|
||||||
// console.log(res)
|
// console.log(res)
|
||||||
|
|
||||||
// res = await evaluateWithEnsemble({method:"logreg" ,answer: "The COVID-19 Pandemic"});
|
// res = await evaluateWithRoberta({answer: "The COVID-19 Pandemic"});
|
||||||
// console.log(res)
|
// console.log(res)
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
import axios from "axios";
|
||||||
|
|
||||||
|
export async function evaluateWithRagas({
|
||||||
|
question,
|
||||||
|
answer,
|
||||||
|
contexts,
|
||||||
|
}: {
|
||||||
|
question: string;
|
||||||
|
answer: string;
|
||||||
|
contexts: string[];
|
||||||
|
}) {
|
||||||
|
const res = await axios.post("http://localhost:8001/evaluate", {
|
||||||
|
question,
|
||||||
|
answer,
|
||||||
|
contexts,
|
||||||
|
});
|
||||||
|
|
||||||
|
return res.data;
|
||||||
|
}
|
||||||
|
|
||||||
|
// let res = await evaluateWithRagas({question: "Who was Bill Nye", answer: "Bill Nye was a Scientist", contexts: ["Bill nye was a Scientist"]});
|
||||||
|
// console.log(res)
|
||||||
@@ -26,9 +26,6 @@ async function extractWebpageContentWorker(url: string): Promise<string[]> {
|
|||||||
try {
|
try {
|
||||||
const options = new firefox.Options();
|
const options = new firefox.Options();
|
||||||
options.addArguments("--headless");
|
options.addArguments("--headless");
|
||||||
options.addArguments("--disable-gpu");
|
|
||||||
options.addArguments("--no-sandbox"); // Linux sandbox issues
|
|
||||||
options.addArguments("--disable-dev-shm-usage"); // /dev/shm issues
|
|
||||||
driver = await new Builder()
|
driver = await new Builder()
|
||||||
.forBrowser(Browser.FIREFOX)
|
.forBrowser(Browser.FIREFOX)
|
||||||
.setFirefoxOptions(options)
|
.setFirefoxOptions(options)
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ set -e
|
|||||||
run_agent () {
|
run_agent () {
|
||||||
echo "Starting LangGraph agent..."
|
echo "Starting LangGraph agent..."
|
||||||
cd agent
|
cd agent
|
||||||
npx @langchain/langgraph-cli dev
|
npx @langchain/langgraph-cli@1.1.17 dev
|
||||||
}
|
}
|
||||||
|
|
||||||
run_ensemble_service () {
|
run_ensemble_service () {
|
||||||
|
|||||||
@@ -92,7 +92,7 @@ LABEL_TO_INT = {v: k for k, v in INT_TO_LABEL.items()}
|
|||||||
flan_tokenizer = AutoTokenizer.from_pretrained(FLAN_PATH)
|
flan_tokenizer = AutoTokenizer.from_pretrained(FLAN_PATH)
|
||||||
flan_model = AutoModelForSeq2SeqLM.from_pretrained(FLAN_PATH)
|
flan_model = AutoModelForSeq2SeqLM.from_pretrained(FLAN_PATH)
|
||||||
|
|
||||||
device = torch.device("cpu")
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
flan_model.to(device)
|
flan_model.to(device)
|
||||||
flan_model.eval()
|
flan_model.eval()
|
||||||
|
|
||||||
|
|||||||
@@ -17,7 +17,10 @@ const AGENT_NAME = process.env.AGENT ?? "agent";
|
|||||||
*/
|
*/
|
||||||
const MODE = process.env.MODE ?? "claim";
|
const MODE = process.env.MODE ?? "claim";
|
||||||
|
|
||||||
const MAX_CONCURRENCY = 1;
|
const MAX_CONCURRENCY = 5;
|
||||||
|
|
||||||
|
const OFFSET = parseInt(process.env.OFFSET ?? "0", 10);
|
||||||
|
const LIMIT = process.env.LIMIT ? parseInt(process.env.LIMIT, 10) : null;
|
||||||
|
|
||||||
const client = new Client({ apiUrl: API_URL });
|
const client = new Client({ apiUrl: API_URL });
|
||||||
|
|
||||||
@@ -164,9 +167,18 @@ async function processRecord(record: any): Promise<ResultRecord> {
|
|||||||
async function main() {
|
async function main() {
|
||||||
console.log("Reading input file...");
|
console.log("Reading input file...");
|
||||||
|
|
||||||
const records = await loadInputs();
|
const allRecords = await loadInputs();
|
||||||
|
|
||||||
console.log(`Loaded ${records.length} records`);
|
console.log(`Loaded ${allRecords.length} records`);
|
||||||
|
|
||||||
|
const records = allRecords.slice(
|
||||||
|
OFFSET,
|
||||||
|
LIMIT !== null ? OFFSET + LIMIT : undefined
|
||||||
|
);
|
||||||
|
|
||||||
|
console.log(
|
||||||
|
`Processing ${records.length} records (offset=${OFFSET}, limit=${LIMIT ?? "∞"})`
|
||||||
|
);
|
||||||
|
|
||||||
fs.writeFileSync(OUTPUT_FILE, "", { flag: "a" });
|
fs.writeFileSync(OUTPUT_FILE, "", { flag: "a" });
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,119 @@
|
|||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.common.exceptions import WebDriverException, TimeoutException, StaleElementReferenceException
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
def init_driver():
|
||||||
|
options = Options()
|
||||||
|
options.headless = True
|
||||||
|
options.add_argument("--disable-gpu")
|
||||||
|
options.add_argument("--no-sandbox")
|
||||||
|
options.add_argument("--headless")
|
||||||
|
options.add_argument("--disable-blink-features=AutomationControlled")
|
||||||
|
options.add_argument("--window-size=1920,1080")
|
||||||
|
prefs = {
|
||||||
|
"profile.managed_default_content_settings.images": 2, # block images
|
||||||
|
"profile.default_content_setting_values.stylesheets": 2, # block CSS
|
||||||
|
"profile.managed_default_content_settings.cookies": 2, # optional
|
||||||
|
}
|
||||||
|
options.add_experimental_option("prefs", prefs)
|
||||||
|
|
||||||
|
driver = webdriver.Chrome(options=options)
|
||||||
|
driver.set_page_load_timeout(30)
|
||||||
|
return driver
|
||||||
|
|
||||||
|
def is_root_url(url):
|
||||||
|
parsed = urlparse(url)
|
||||||
|
return parsed.path in ("", "/")
|
||||||
|
|
||||||
|
def is_404_page(driver):
|
||||||
|
"""Safely check for 404, handling stale elements."""
|
||||||
|
try:
|
||||||
|
title = driver.title.lower()
|
||||||
|
body_text = driver.find_element("tag name", "body").text.lower()
|
||||||
|
return "404" in title or "404" in body_text
|
||||||
|
except StaleElementReferenceException:
|
||||||
|
return False
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def check_url_selenium(url):
|
||||||
|
driver = None
|
||||||
|
try:
|
||||||
|
driver = init_driver()
|
||||||
|
driver.get(url)
|
||||||
|
# 404 check
|
||||||
|
if is_404_page(driver):
|
||||||
|
return False, "404 page detected"
|
||||||
|
# Root URL after redirects
|
||||||
|
final_url = driver.current_url
|
||||||
|
if is_root_url(final_url):
|
||||||
|
return False, f"Redirected to root URL ({final_url})"
|
||||||
|
return True, None
|
||||||
|
except (WebDriverException, TimeoutException) as e:
|
||||||
|
return False, str(e)
|
||||||
|
finally:
|
||||||
|
if driver:
|
||||||
|
driver.quit()
|
||||||
|
|
||||||
|
def process_event(event):
|
||||||
|
"""Process an event only if score > 0.4."""
|
||||||
|
score = event.get("score", 0)
|
||||||
|
if score <= 0.4:
|
||||||
|
return None, False, "Score too low"
|
||||||
|
url = event.get("Url")
|
||||||
|
if not url:
|
||||||
|
return None, False, "No URL"
|
||||||
|
is_valid, error_msg = check_url_selenium(url)
|
||||||
|
event["url_valid"] = is_valid
|
||||||
|
return url, is_valid, error_msg
|
||||||
|
|
||||||
|
def process_jsonl_file(file_path, max_workers=4):
|
||||||
|
invalid_urls = []
|
||||||
|
valid_urls = 0
|
||||||
|
|
||||||
|
# Gather events with score > 0.4
|
||||||
|
urls_to_check = []
|
||||||
|
with open(file_path, "r", encoding="utf-8") as f:
|
||||||
|
for line in f:
|
||||||
|
line_data = json.loads(line)
|
||||||
|
if line_data.get("status") != "success":
|
||||||
|
continue
|
||||||
|
for event in line_data.get("events", []):
|
||||||
|
if event.get("score", 0) > 0.4:
|
||||||
|
urls_to_check.append(event)
|
||||||
|
|
||||||
|
total_urls = len(urls_to_check)
|
||||||
|
|
||||||
|
# ThreadPoolExecutor with tqdm progress bar
|
||||||
|
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||||
|
future_to_event = {executor.submit(process_event, e): e for e in urls_to_check}
|
||||||
|
for future in tqdm(as_completed(future_to_event), total=total_urls, desc="Checking URLs"):
|
||||||
|
url, is_valid, error_msg = future.result()
|
||||||
|
if not is_valid and url:
|
||||||
|
invalid_urls.append((url, error_msg))
|
||||||
|
else:
|
||||||
|
valid_urls += 1
|
||||||
|
|
||||||
|
# Summary
|
||||||
|
if invalid_urls:
|
||||||
|
print("\nList of invalid URLs and reasons:")
|
||||||
|
for url, err in invalid_urls:
|
||||||
|
print(f"{url} --> {err}")
|
||||||
|
print("\n=== URL Validation Summary ===")
|
||||||
|
print(f"Total URLs processed: {total_urls}")
|
||||||
|
print(f"Valid URLs (loaded successfully): {valid_urls}")
|
||||||
|
print(f"Invalid URLs: {len(invalid_urls)}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser(description="Validate URLs in JSONL file events using Selenium")
|
||||||
|
parser.add_argument("file_path", type=str, help="Path to the JSONL file")
|
||||||
|
parser.add_argument("--workers", type=int, default=4, help="Number of parallel Selenium workers")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
process_jsonl_file(args.file_path, max_workers=args.workers)
|
||||||
@@ -27,7 +27,7 @@ DEFAULT_PARAMS = [
|
|||||||
("organization", "http://weverify.eu/resource/Organization/3727f7b2aa90ec0716693e5464b28d18"), # StopFake
|
("organization", "http://weverify.eu/resource/Organization/3727f7b2aa90ec0716693e5464b28d18"), # StopFake
|
||||||
]
|
]
|
||||||
|
|
||||||
NUM_RANDOM_CLAIMS = 200
|
NUM_RANDOM_CLAIMS = 2000
|
||||||
|
|
||||||
INPUT_FILE = "../../data/input.jsonl"
|
INPUT_FILE = "../../data/input.jsonl"
|
||||||
OUTPUT_FILE = "../../data/claims.json"
|
OUTPUT_FILE = "../../data/claims.json"
|
||||||
|
|||||||
Reference in New Issue
Block a user