diff --git a/agent/package-lock.json b/agent/package-lock.json index 45f72b0..7f344b3 100644 --- a/agent/package-lock.json +++ b/agent/package-lock.json @@ -21,6 +21,7 @@ "fs": "^0.0.1-security", "langchain": "^1.2.14", "selenium-webdriver": "^4.40.0", + "tldts": "^7.0.23", "wink-bm25-text-search": "^3.1.2", "wink-nlp-utils": "^2.1.0", "winston": "^3.19.0" @@ -2681,9 +2682,9 @@ } }, "node_modules/tar": { - "version": "7.5.7", - "resolved": "https://registry.npmjs.org/tar/-/tar-7.5.7.tgz", - "integrity": "sha512-fov56fJiRuThVFXD6o6/Q354S7pnWMJIVlDBYijsTNx6jKSE4pvrDTs6lUnmGvNyfJwFQQwWy3owKz1ucIhveQ==", + "version": "7.5.9", + "resolved": "https://registry.npmjs.org/tar/-/tar-7.5.9.tgz", + "integrity": "sha512-BTLcK0xsDh2+PUe9F6c2TlRp4zOOBMTkoQHQIWSIzI0R7KG46uEwq4OPk2W7bZcprBMsuaeFsqwYr7pjh6CuHg==", "license": "BlueOak-1.0.0", "dependencies": { "@isaacs/fs-minipass": "^4.0.0", @@ -2702,6 +2703,24 @@ "integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==", "license": "MIT" }, + "node_modules/tldts": { + "version": "7.0.23", + "resolved": "https://registry.npmjs.org/tldts/-/tldts-7.0.23.tgz", + "integrity": "sha512-ASdhgQIBSay0R/eXggAkQ53G4nTJqTXqC2kbaBbdDwM7SkjyZyO0OaaN1/FH7U/yCeqOHDwFO5j8+Os/IS1dXw==", + "license": "MIT", + "dependencies": { + "tldts-core": "^7.0.23" + }, + "bin": { + "tldts": "bin/cli.js" + } + }, + "node_modules/tldts-core": { + "version": "7.0.23", + "resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-7.0.23.tgz", + "integrity": "sha512-0g9vrtDQLrNIiCj22HSe9d4mLVG3g5ph5DZ8zCKBr4OtrspmNB6ss7hVyzArAeE88ceZocIEGkyW1Ime7fxPtQ==", + "license": "MIT" + }, "node_modules/tmp": { "version": "0.2.5", "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.5.tgz", diff --git a/agent/package.json b/agent/package.json index 44d1e5f..f82328f 100644 --- a/agent/package.json +++ b/agent/package.json @@ -25,6 +25,7 @@ "fs": "^0.0.1-security", "langchain": "^1.2.14", "selenium-webdriver": "^4.40.0", + 
"tldts": "^7.0.23",
     "wink-bm25-text-search": "^3.1.2",
     "wink-nlp-utils": "^2.1.0",
     "winston": "^3.19.0"
diff --git a/agent/tools/checkDisinfo.ts b/agent/tools/checkDisinfo.ts
new file mode 100644
index 0000000..839b413
--- /dev/null
+++ b/agent/tools/checkDisinfo.ts
@@ -0,0 +1,103 @@
+import path from "path";
+import { logger } from "../utils/logger"
+import fs from "fs";
+import { parse } from "tldts";
+
+/** One entry as it appears in the JSON data file. */
+type RawSiteRecord = {
+  Domain: string;
+  Score: string;
+};
+
+/** A listed domain (lower-cased) together with its numeric score. */
+type SiteScore = {
+  domain: string;
+  score: number;
+};
+
+// Relative path: path.resolve() anchors it to the process working directory.
+const FILE_PATH = "../data/Iffy.json"
+
+/** Runtime check that a parsed JSON value carries a string `Domain` field. */
+function hasDomain(item: unknown): item is Pick<RawSiteRecord, "Domain"> & { Score?: unknown } {
+  return (
+    typeof item === "object" &&
+    item !== null &&
+    typeof (item as { Domain?: unknown }).Domain === "string"
+  );
+}
+
+/**
+ * Read FILE_PATH and convert its entries into SiteScore values.
+ *
+ * Entries without a non-empty string `Domain` are skipped so that they can
+ * never be matched by a URL whose domain could not be determined.
+ *
+ * @throws Error if the file's top-level JSON value is not an array. Errors
+ *   from fs.readFileSync and JSON.parse propagate unchanged.
+ */
+function parseSiteScores(): SiteScore[] {
+  const raw = fs.readFileSync(path.resolve(FILE_PATH), "utf-8");
+  const data: unknown = JSON.parse(raw);
+
+  if (!Array.isArray(data)) {
+    throw new Error("Invalid JSON: expected array");
+  }
+
+  const entries: SiteScore[] = [];
+  for (const item of data) {
+    if (!hasDomain(item)) {
+      continue;
+    }
+
+    const domain = item.Domain.trim().toLowerCase();
+    if (domain !== "") {
+      entries.push({ domain, score: Number(item.Score) });
+    }
+  }
+  return entries;
+}
+
+// Lookup table keyed by domain; built on the first checkDisinfo() call.
+let scores: Map<string, SiteScore> | null = null
+
+/** Index the parsed entries by domain; the first entry for a domain wins. */
+function loadScores(): Map<string, SiteScore> {
+  const byDomain = new Map<string, SiteScore>();
+  for (const entry of parseSiteScores()) {
+    if (!byDomain.has(entry.domain)) {
+      byDomain.set(entry.domain, entry);
+    }
+  }
+  return byDomain;
+}
+
+/**
+ * Report whether the domain tldts extracts from `url` appears in the data file.
+ *
+ * @param url Address to check.
+ * @returns true (after logging a warning) when the domain is listed; false
+ *   when it is not listed or no domain could be extracted from `url`.
+ */
+export function checkDisinfo(url: string): boolean {
+  if (scores == null) {
+    scores = loadScores();
+  }
+
+  // A null domain means tldts could not extract one; never treat that as a match.
+  const domain = parse(url).domain;
+  if (domain == null) {
+    return false;
+  }
+
+  const match = scores.get(domain.toLowerCase());
+  if (match === undefined) {
+    return false;
+  }
+
+  logger.warn("Bad source %s detected with score %s", url, match.score)
+  return true;
+}
+
+
+// console.log(checkDisinfo("http://www.zerohedge.com"))
\ No newline at end of file
diff --git a/agent/tools/webSearch.ts b/agent/tools/webSearch.ts
index 833798e..2b47ef0 100644
--- a/agent/tools/webSearch.ts
+++ b/agent/tools/webSearch.ts
@@ -1,4 +1,5 @@
 import axios from "axios";
+import { checkDisinfo } from "./checkDisinfo";
 
 export async function queryScraper(query: string): Promise {
   const instance = process.env.SCRAPER_INSTANCE;
@@ -44,6 +45,10 @@ export async function queryScraper(query: string): Promise {
     const context = data.web ??
[]; const lines: string[] = context.map((item: any) => { + if (checkDisinfo(item.url)) { + return ""; + } + const title = (item.title ?? "").trim(); const desc = (item.description ?? "").trim(); const link = (item.url ?? "").trim(); diff --git a/supporting/dbkf/fetch.py b/supporting/dbkf/fetch.py index 3ba8220..994d3cd 100644 --- a/supporting/dbkf/fetch.py +++ b/supporting/dbkf/fetch.py @@ -11,16 +11,16 @@ BASE_URL = "https://dbkf.ontotext.com/rest-api/search/documents" DEFAULT_PARAMS = { "concept": "http://weverify.eu/resource/Concept/Q212", - "documentTypes": "http://schema.org/Claim", + # "documentTypes": "http://schema.org/Claim", "from": "2000-01-01", - "to": "2026-10-17", + "to": "2026-02-19", "lang": "en", - "limit": 300, # Max per page + "limit": 5000, # Max per page "page": 1, "orderBy": "date" } -NUM_RANDOM_CLAIMS = 10 +NUM_RANDOM_CLAIMS = 20 OUTPUT_FILE = "../../data/claims.json" diff --git a/supporting/scorer/display.py b/supporting/scorer/display.py index 3228b96..355fef8 100644 --- a/supporting/scorer/display.py +++ b/supporting/scorer/display.py @@ -1,3 +1,4 @@ +import copy import streamlit as st import json import random @@ -154,6 +155,7 @@ elif view == "Single Claim Random": claims = bundle["claims"] st.subheader(entry.get("text")) + st.write(entry.get("normalized", "")) # -------------------------- # Stable Drag IDs (FIX) @@ -267,7 +269,7 @@ elif view == "Single Claim Random": claim_obj["human_score"] = round(score, 3) save_data(INPUT_FILE, st.session_state.data) - save_data_clean(OUTPUT_FILE, st.session_state.data) + save_data_clean(OUTPUT_FILE, copy.deepcopy(st.session_state.data)) print("Ranking converted to scores and saved!")