Add filtering of results from known disinformation sources

This commit is contained in:
William Jeynes
2026-02-22 15:14:58 +00:00
parent 8ffe8dec82
commit d1ab938c0b
6 changed files with 90 additions and 8 deletions
+22 -3
View File
@@ -21,6 +21,7 @@
"fs": "^0.0.1-security",
"langchain": "^1.2.14",
"selenium-webdriver": "^4.40.0",
"tldts": "^7.0.23",
"wink-bm25-text-search": "^3.1.2",
"wink-nlp-utils": "^2.1.0",
"winston": "^3.19.0"
@@ -2681,9 +2682,9 @@
}
},
"node_modules/tar": {
"version": "7.5.7",
"resolved": "https://registry.npmjs.org/tar/-/tar-7.5.7.tgz",
"integrity": "sha512-fov56fJiRuThVFXD6o6/Q354S7pnWMJIVlDBYijsTNx6jKSE4pvrDTs6lUnmGvNyfJwFQQwWy3owKz1ucIhveQ==",
"version": "7.5.9",
"resolved": "https://registry.npmjs.org/tar/-/tar-7.5.9.tgz",
"integrity": "sha512-BTLcK0xsDh2+PUe9F6c2TlRp4zOOBMTkoQHQIWSIzI0R7KG46uEwq4OPk2W7bZcprBMsuaeFsqwYr7pjh6CuHg==",
"license": "BlueOak-1.0.0",
"dependencies": {
"@isaacs/fs-minipass": "^4.0.0",
@@ -2702,6 +2703,24 @@
"integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==",
"license": "MIT"
},
"node_modules/tldts": {
"version": "7.0.23",
"resolved": "https://registry.npmjs.org/tldts/-/tldts-7.0.23.tgz",
"integrity": "sha512-ASdhgQIBSay0R/eXggAkQ53G4nTJqTXqC2kbaBbdDwM7SkjyZyO0OaaN1/FH7U/yCeqOHDwFO5j8+Os/IS1dXw==",
"license": "MIT",
"dependencies": {
"tldts-core": "^7.0.23"
},
"bin": {
"tldts": "bin/cli.js"
}
},
"node_modules/tldts-core": {
"version": "7.0.23",
"resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-7.0.23.tgz",
"integrity": "sha512-0g9vrtDQLrNIiCj22HSe9d4mLVG3g5ph5DZ8zCKBr4OtrspmNB6ss7hVyzArAeE88ceZocIEGkyW1Ime7fxPtQ==",
"license": "MIT"
},
"node_modules/tmp": {
"version": "0.2.5",
"resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.5.tgz",
+1
View File
@@ -25,6 +25,7 @@
"fs": "^0.0.1-security",
"langchain": "^1.2.14",
"selenium-webdriver": "^4.40.0",
"tldts": "^7.0.23",
"wink-bm25-text-search": "^3.1.2",
"wink-nlp-utils": "^2.1.0",
"winston": "^3.19.0"
+55
View File
@@ -0,0 +1,55 @@
import path from "path";
import { logger } from "../utils/logger"
import fs from "fs";
import { parse } from "tldts";
/** One dataset entry in the shape this module expects to find in the JSON file. */
interface RawSiteRecord {
  /** Domain name of the listed site. */
  Domain: string;
  /** Score kept as text; it is converted with Number() when the file is loaded. */
  Score: string;
}

/** In-memory form of a dataset entry after loading. */
interface SiteScore {
  domain: string;
  score: number;
}

/**
 * Location of the JSON dataset. The value is handed to path.resolve(), so a
 * relative path is interpreted against the process's current working directory.
 */
const FILE_PATH = "../data/Iffy.json";
/**
 * Load the site-score dataset from disk and normalise its entries.
 *
 * @param filePath - JSON file to read; defaults to FILE_PATH. It is passed
 *   through path.resolve(), so a relative path is resolved against the
 *   process's current working directory.
 * @returns One entry per usable record: the domain trimmed and lower-cased,
 *   the score converted with Number() (NaN when the value is not numeric).
 *   Records that are not objects or lack a non-empty string `Domain` are
 *   skipped.
 * @throws Error when the file cannot be read, is not valid JSON, or its
 *   top-level value is not an array.
 */
function parseSiteScores(filePath: string = FILE_PATH): SiteScore[] {
  const raw = fs.readFileSync(path.resolve(filePath), "utf-8");
  const data: unknown = JSON.parse(raw);
  if (!Array.isArray(data)) {
    throw new Error("Invalid JSON: expected array");
  }

  const result: SiteScore[] = [];
  for (const item of data as unknown[]) {
    // The file is external data: a null or primitive element must not crash
    // the loader, so it is skipped instead of being dereferenced.
    if (typeof item !== "object" || item === null) {
      continue;
    }
    const { Domain, Score } = item as { Domain?: unknown; Score?: unknown };
    // An entry without a usable domain can never identify a site; keeping it
    // would only leave a non-string `domain` in the returned list.
    if (typeof Domain !== "string") {
      continue;
    }
    // Domain names are case-insensitive, so store one canonical spelling.
    const domain = Domain.trim().toLowerCase();
    if (domain === "") {
      continue;
    }
    result.push({ domain, score: Number(Score) });
  }
  return result;
}
/** Dataset cache; stays null until the first checkDisinfo() call loads it. */
let scores: SiteScore[] | null = null;

/**
 * Report whether a URL belongs to a domain listed in the site-score dataset.
 *
 * The dataset is loaded on the first call and reused afterwards.
 *
 * @param url - URL to check.
 * @returns true when the domain extracted from `url` is present in the
 *   dataset (a warning is logged in that case); false otherwise, including
 *   when `url` is not a string or no domain can be extracted from it.
 * @throws Whatever parseSiteScores() throws when the dataset cannot be loaded.
 */
export function checkDisinfo(url: string): boolean {
  if (scores === null) {
    scores = parseSiteScores();
  }

  // Callers may forward untyped values (e.g. a missing field of an API
  // response); a non-string can never name a listed site.
  if (typeof url !== "string") {
    return false;
  }

  const domain = parse(url).domain;
  // No extractable domain means nothing to look up. Bail out explicitly so a
  // record with a missing domain can never "match" a null result.
  if (domain == null) {
    return false;
  }

  const wanted = domain.toLowerCase();
  // Defensive: the dataset is external, so do not assume every record carries
  // a string domain. Domain names are compared case-insensitively.
  const match = scores.find(
    (entry) => typeof entry.domain === "string" && entry.domain.toLowerCase() === wanted,
  );
  if (match === undefined) {
    return false;
  }

  logger.warn("Bad source %s detected with score %s", url, match.score);
  return true;
}
// console.log(checkDisinfo("http://www.zerohedge.com"))
+5
View File
@@ -1,4 +1,5 @@
import axios from "axios";
import { checkDisinfo } from "./checkDisinfo";
export async function queryScraper(query: string): Promise<string[]> {
const instance = process.env.SCRAPER_INSTANCE;
@@ -44,6 +45,10 @@ export async function queryScraper(query: string): Promise<string[]> {
const context = data.web ?? [];
const lines: string[] = context.map((item: any) => {
if (checkDisinfo(item.url)) {
return "";
}
const title = (item.title ?? "").trim();
const desc = (item.description ?? "").trim();
const link = (item.url ?? "").trim();
+4 -4
View File
@@ -11,16 +11,16 @@ BASE_URL = "https://dbkf.ontotext.com/rest-api/search/documents"
DEFAULT_PARAMS = {
"concept": "http://weverify.eu/resource/Concept/Q212",
"documentTypes": "http://schema.org/Claim",
# "documentTypes": "http://schema.org/Claim",
"from": "2000-01-01",
"to": "2026-10-17",
"to": "2026-02-19",
"lang": "en",
"limit": 300, # Max per page
"limit": 5000, # Max per page
"page": 1,
"orderBy": "date"
}
NUM_RANDOM_CLAIMS = 10
NUM_RANDOM_CLAIMS = 20
OUTPUT_FILE = "../../data/claims.json"
+3 -1
View File
@@ -1,3 +1,4 @@
import copy
import streamlit as st
import json
import random
@@ -154,6 +155,7 @@ elif view == "Single Claim Random":
claims = bundle["claims"]
st.subheader(entry.get("text"))
st.write(entry.get("normalized", ""))
# --------------------------
# Stable Drag IDs (FIX)
@@ -267,7 +269,7 @@ elif view == "Single Claim Random":
claim_obj["human_score"] = round(score, 3)
save_data(INPUT_FILE, st.session_state.data)
save_data_clean(OUTPUT_FILE, st.session_state.data)
save_data_clean(OUTPUT_FILE, copy.deepcopy(st.session_state.data))
print("Ranking converted to scores and saved!")