Add filtering of results from known disinformation sources
This commit is contained in:
Generated
+22
-3
@@ -21,6 +21,7 @@
|
|||||||
"fs": "^0.0.1-security",
|
"fs": "^0.0.1-security",
|
||||||
"langchain": "^1.2.14",
|
"langchain": "^1.2.14",
|
||||||
"selenium-webdriver": "^4.40.0",
|
"selenium-webdriver": "^4.40.0",
|
||||||
|
"tldts": "^7.0.23",
|
||||||
"wink-bm25-text-search": "^3.1.2",
|
"wink-bm25-text-search": "^3.1.2",
|
||||||
"wink-nlp-utils": "^2.1.0",
|
"wink-nlp-utils": "^2.1.0",
|
||||||
"winston": "^3.19.0"
|
"winston": "^3.19.0"
|
||||||
@@ -2681,9 +2682,9 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/tar": {
|
"node_modules/tar": {
|
||||||
"version": "7.5.7",
|
"version": "7.5.9",
|
||||||
"resolved": "https://registry.npmjs.org/tar/-/tar-7.5.7.tgz",
|
"resolved": "https://registry.npmjs.org/tar/-/tar-7.5.9.tgz",
|
||||||
"integrity": "sha512-fov56fJiRuThVFXD6o6/Q354S7pnWMJIVlDBYijsTNx6jKSE4pvrDTs6lUnmGvNyfJwFQQwWy3owKz1ucIhveQ==",
|
"integrity": "sha512-BTLcK0xsDh2+PUe9F6c2TlRp4zOOBMTkoQHQIWSIzI0R7KG46uEwq4OPk2W7bZcprBMsuaeFsqwYr7pjh6CuHg==",
|
||||||
"license": "BlueOak-1.0.0",
|
"license": "BlueOak-1.0.0",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@isaacs/fs-minipass": "^4.0.0",
|
"@isaacs/fs-minipass": "^4.0.0",
|
||||||
@@ -2702,6 +2703,24 @@
|
|||||||
"integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==",
|
"integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==",
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
|
"node_modules/tldts": {
|
||||||
|
"version": "7.0.23",
|
||||||
|
"resolved": "https://registry.npmjs.org/tldts/-/tldts-7.0.23.tgz",
|
||||||
|
"integrity": "sha512-ASdhgQIBSay0R/eXggAkQ53G4nTJqTXqC2kbaBbdDwM7SkjyZyO0OaaN1/FH7U/yCeqOHDwFO5j8+Os/IS1dXw==",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"tldts-core": "^7.0.23"
|
||||||
|
},
|
||||||
|
"bin": {
|
||||||
|
"tldts": "bin/cli.js"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/tldts-core": {
|
||||||
|
"version": "7.0.23",
|
||||||
|
"resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-7.0.23.tgz",
|
||||||
|
"integrity": "sha512-0g9vrtDQLrNIiCj22HSe9d4mLVG3g5ph5DZ8zCKBr4OtrspmNB6ss7hVyzArAeE88ceZocIEGkyW1Ime7fxPtQ==",
|
||||||
|
"license": "MIT"
|
||||||
|
},
|
||||||
"node_modules/tmp": {
|
"node_modules/tmp": {
|
||||||
"version": "0.2.5",
|
"version": "0.2.5",
|
||||||
"resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.5.tgz",
|
"resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.5.tgz",
|
||||||
|
|||||||
@@ -25,6 +25,7 @@
|
|||||||
"fs": "^0.0.1-security",
|
"fs": "^0.0.1-security",
|
||||||
"langchain": "^1.2.14",
|
"langchain": "^1.2.14",
|
||||||
"selenium-webdriver": "^4.40.0",
|
"selenium-webdriver": "^4.40.0",
|
||||||
|
"tldts": "^7.0.23",
|
||||||
"wink-bm25-text-search": "^3.1.2",
|
"wink-bm25-text-search": "^3.1.2",
|
||||||
"wink-nlp-utils": "^2.1.0",
|
"wink-nlp-utils": "^2.1.0",
|
||||||
"winston": "^3.19.0"
|
"winston": "^3.19.0"
|
||||||
|
|||||||
@@ -0,0 +1,55 @@
|
|||||||
|
import path from "path";
|
||||||
|
import { logger } from "../utils/logger"
|
||||||
|
import fs from "fs";
|
||||||
|
import { parse } from "tldts";
|
||||||
|
|
||||||
|
/** One entry as it is stored in the JSON list: capitalised keys, score kept as text. */
type RawSiteRecord = {
  Domain: string;
  Score: string;
};

/** Normalised entry used by the rest of this module. */
type SiteScore = {
  domain: string;
  score: number;
};

// NOTE(review): path.resolve() interprets this relative to the process working
// directory, not relative to this module — confirm the process is always
// started from the directory this path assumes.
const FILE_PATH = "../data/Iffy.json"

/**
 * Converts one parsed JSON value into a SiteScore.
 *
 * @returns the normalised entry, or `null` when the value is not an object
 *          carrying a non-empty string `Domain`.
 */
function toSiteScore(item: unknown): SiteScore | null {
  if (typeof item !== "object" || item === null) {
    return null;
  }

  const { Domain, Score } = item as Partial<Record<keyof RawSiteRecord, unknown>>;
  if (typeof Domain !== "string") {
    return null;
  }

  // Trim and lower-case so the stored value compares equal to a normalised
  // host name regardless of how the list was typed.
  const domain = Domain.trim().toLowerCase();
  if (domain === "") {
    return null;
  }

  return { domain, score: Number(Score) };
}

/**
 * Reads the site-score list from disk and returns it in normalised form.
 *
 * Entries without a usable `Domain` are dropped rather than returned with an
 * undefined domain: one malformed row should not disable filtering for the
 * whole list, and an undefined domain can produce false matches under loose
 * comparison.
 *
 * @param filePath JSON file to read; defaults to FILE_PATH.
 * @returns the valid entries, in file order.
 * @throws Error when the file's top-level JSON value is not an array. Errors
 *         from `fs.readFileSync` and `JSON.parse` propagate unchanged.
 */
function parseSiteScores(filePath: string = FILE_PATH): SiteScore[] {
  const raw = fs.readFileSync(path.resolve(filePath), "utf-8");
  const data: unknown = JSON.parse(raw);

  if (!Array.isArray(data)) {
    throw new Error("Invalid JSON: expected array");
  }

  const result: SiteScore[] = [];
  for (const item of data) {
    const entry = toSiteScore(item);
    if (entry !== null) {
      result.push(entry);
    }
  }
  return result;
}
|
||||||
|
|
||||||
|
// Lookup table (lower-cased domain -> score), built on the first
// checkDisinfo() call and reused afterwards. Stays null until then.
let scores: Map<string, number> | null = null

/**
 * Reports whether `url` belongs to a domain present in the site-score list.
 *
 * The domain is taken from tldts' `parse(url).domain` and looked up in the
 * list loaded by `parseSiteScores()`. A hit is logged as a warning.
 *
 * NOTE(review): only `parse(url).domain` is compared, so a list entry that is
 * itself a subdomain would never match — confirm against the list contents.
 *
 * @param url URL of the source to check.
 * @returns `true` when the URL's domain is on the list; `false` otherwise,
 *          including when no domain can be extracted from `url`.
 * @throws whatever `parseSiteScores()` throws on the first call if the list
 *         cannot be loaded.
 */
export function checkDisinfo(url: string): boolean {
  // Callers feed values taken from scraped payloads, so the value may be
  // missing at runtime despite the declared type.
  if (typeof url !== "string" || url.trim() === "") {
    return false;
  }

  if (scores == null) {
    const table = new Map<string, number>();
    for (const entry of parseSiteScores()) {
      // Ignore entries without a usable domain so they can never match.
      if (typeof entry.domain !== "string") {
        continue;
      }
      const key = entry.domain.trim().toLowerCase();
      // Keep the first occurrence of a domain, as Array.prototype.find did.
      if (key !== "" && !table.has(key)) {
        table.set(key, entry.score);
      }
    }
    scores = table;
  }

  const domain = parse(url).domain;
  // No domain extracted: nothing to look up. Returning here also avoids the
  // loose-equality trap where a null domain equals an entry's undefined one.
  if (domain == null) {
    return false;
  }

  const score = scores.get(domain.toLowerCase());
  if (score === undefined) {
    return false;
  }

  logger.warn("Bad source %s detected with score %s", url, score)
  return true;
}
|
||||||
|
|
||||||
|
|
||||||
|
// console.log(checkDisinfo("http://www.zerohedge.com"))
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
import axios from "axios";
|
import axios from "axios";
|
||||||
|
import { checkDisinfo } from "./checkDisinfo";
|
||||||
|
|
||||||
export async function queryScraper(query: string): Promise<string[]> {
|
export async function queryScraper(query: string): Promise<string[]> {
|
||||||
const instance = process.env.SCRAPER_INSTANCE;
|
const instance = process.env.SCRAPER_INSTANCE;
|
||||||
@@ -44,6 +45,10 @@ export async function queryScraper(query: string): Promise<string[]> {
|
|||||||
const context = data.web ?? [];
|
const context = data.web ?? [];
|
||||||
|
|
||||||
const lines: string[] = context.map((item: any) => {
|
const lines: string[] = context.map((item: any) => {
|
||||||
|
if (checkDisinfo(item.url)) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
const title = (item.title ?? "").trim();
|
const title = (item.title ?? "").trim();
|
||||||
const desc = (item.description ?? "").trim();
|
const desc = (item.description ?? "").trim();
|
||||||
const link = (item.url ?? "").trim();
|
const link = (item.url ?? "").trim();
|
||||||
|
|||||||
@@ -11,16 +11,16 @@ BASE_URL = "https://dbkf.ontotext.com/rest-api/search/documents"
|
|||||||
|
|
||||||
DEFAULT_PARAMS = {
|
DEFAULT_PARAMS = {
|
||||||
"concept": "http://weverify.eu/resource/Concept/Q212",
|
"concept": "http://weverify.eu/resource/Concept/Q212",
|
||||||
"documentTypes": "http://schema.org/Claim",
|
# "documentTypes": "http://schema.org/Claim",
|
||||||
"from": "2000-01-01",
|
"from": "2000-01-01",
|
||||||
"to": "2026-10-17",
|
"to": "2026-02-19",
|
||||||
"lang": "en",
|
"lang": "en",
|
||||||
"limit": 300, # Max per page
|
"limit": 5000, # Max per page
|
||||||
"page": 1,
|
"page": 1,
|
||||||
"orderBy": "date"
|
"orderBy": "date"
|
||||||
}
|
}
|
||||||
|
|
||||||
NUM_RANDOM_CLAIMS = 10
|
NUM_RANDOM_CLAIMS = 20
|
||||||
|
|
||||||
OUTPUT_FILE = "../../data/claims.json"
|
OUTPUT_FILE = "../../data/claims.json"
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import copy
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
@@ -154,6 +155,7 @@ elif view == "Single Claim Random":
|
|||||||
claims = bundle["claims"]
|
claims = bundle["claims"]
|
||||||
|
|
||||||
st.subheader(entry.get("text"))
|
st.subheader(entry.get("text"))
|
||||||
|
st.write(entry.get("normalized", ""))
|
||||||
|
|
||||||
# --------------------------
|
# --------------------------
|
||||||
# Stable Drag IDs (FIX)
|
# Stable Drag IDs (FIX)
|
||||||
@@ -267,7 +269,7 @@ elif view == "Single Claim Random":
|
|||||||
claim_obj["human_score"] = round(score, 3)
|
claim_obj["human_score"] = round(score, 3)
|
||||||
|
|
||||||
save_data(INPUT_FILE, st.session_state.data)
|
save_data(INPUT_FILE, st.session_state.data)
|
||||||
save_data_clean(OUTPUT_FILE, st.session_state.data)
|
save_data_clean(OUTPUT_FILE, copy.deepcopy(st.session_state.data))
|
||||||
|
|
||||||
|
|
||||||
print("Ranking converted to scores and saved!")
|
print("Ranking converted to scores and saved!")
|
||||||
|
|||||||
Reference in New Issue
Block a user