Add filtering of results from known disinformation sources
This commit is contained in:
Generated
+22
-3
@@ -21,6 +21,7 @@
|
||||
"fs": "^0.0.1-security",
|
||||
"langchain": "^1.2.14",
|
||||
"selenium-webdriver": "^4.40.0",
|
||||
"tldts": "^7.0.23",
|
||||
"wink-bm25-text-search": "^3.1.2",
|
||||
"wink-nlp-utils": "^2.1.0",
|
||||
"winston": "^3.19.0"
|
||||
@@ -2681,9 +2682,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/tar": {
|
||||
"version": "7.5.7",
|
||||
"resolved": "https://registry.npmjs.org/tar/-/tar-7.5.7.tgz",
|
||||
"integrity": "sha512-fov56fJiRuThVFXD6o6/Q354S7pnWMJIVlDBYijsTNx6jKSE4pvrDTs6lUnmGvNyfJwFQQwWy3owKz1ucIhveQ==",
|
||||
"version": "7.5.9",
|
||||
"resolved": "https://registry.npmjs.org/tar/-/tar-7.5.9.tgz",
|
||||
"integrity": "sha512-BTLcK0xsDh2+PUe9F6c2TlRp4zOOBMTkoQHQIWSIzI0R7KG46uEwq4OPk2W7bZcprBMsuaeFsqwYr7pjh6CuHg==",
|
||||
"license": "BlueOak-1.0.0",
|
||||
"dependencies": {
|
||||
"@isaacs/fs-minipass": "^4.0.0",
|
||||
@@ -2702,6 +2703,24 @@
|
||||
"integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/tldts": {
|
||||
"version": "7.0.23",
|
||||
"resolved": "https://registry.npmjs.org/tldts/-/tldts-7.0.23.tgz",
|
||||
"integrity": "sha512-ASdhgQIBSay0R/eXggAkQ53G4nTJqTXqC2kbaBbdDwM7SkjyZyO0OaaN1/FH7U/yCeqOHDwFO5j8+Os/IS1dXw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"tldts-core": "^7.0.23"
|
||||
},
|
||||
"bin": {
|
||||
"tldts": "bin/cli.js"
|
||||
}
|
||||
},
|
||||
"node_modules/tldts-core": {
|
||||
"version": "7.0.23",
|
||||
"resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-7.0.23.tgz",
|
||||
"integrity": "sha512-0g9vrtDQLrNIiCj22HSe9d4mLVG3g5ph5DZ8zCKBr4OtrspmNB6ss7hVyzArAeE88ceZocIEGkyW1Ime7fxPtQ==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/tmp": {
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.5.tgz",
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
"fs": "^0.0.1-security",
|
||||
"langchain": "^1.2.14",
|
||||
"selenium-webdriver": "^4.40.0",
|
||||
"tldts": "^7.0.23",
|
||||
"wink-bm25-text-search": "^3.1.2",
|
||||
"wink-nlp-utils": "^2.1.0",
|
||||
"winston": "^3.19.0"
|
||||
|
||||
@@ -0,0 +1,55 @@
|
||||
import path from "path";
|
||||
import { logger } from "../utils/logger"
|
||||
import fs from "fs";
|
||||
import { parse } from "tldts";
|
||||
|
||||
/**
 * Shape that each element of the parsed JSON data file is cast to in
 * `parseSiteScores`. Both fields are declared as strings; `Score` is
 * converted with `Number(...)` when the record is mapped to a `SiteScore`.
 */
type RawSiteRecord = {
|
||||
Domain: string;
|
||||
Score: string;
|
||||
};
|
||||
|
||||
/**
 * Normalized in-memory form of a record: `domain` is copied from the raw
 * record's `Domain`, and `score` is the raw `Score` converted to a number.
 */
type SiteScore = {
|
||||
domain: string;
|
||||
score: number;
|
||||
};
|
||||
|
||||
// Location of the JSON data file read by parseSiteScores().
// NOTE(review): this relative path is passed to path.resolve(), which resolves
// against the process working directory rather than this module's directory —
// confirm the process is always started from a directory where this is valid.
const FILE_PATH = "../data/Iffy.json"
|
||||
|
||||
/**
 * Loads the site-score list from the JSON file at `FILE_PATH`.
 *
 * The file must contain a JSON array. Each element is expected to be an
 * object with a string `Domain` and a `Score`; elements that are not objects,
 * or whose `Domain` is missing, not a string, or blank, are skipped instead of
 * being turned into entries with an `undefined` domain.
 *
 * NOTE(review): `path.resolve(FILE_PATH)` resolves a relative path against
 * the process working directory, not this module's location — confirm that
 * is the intended lookup base.
 *
 * @returns One entry per usable record. `domain` is trimmed and lowercased so
 *   that casing or stray whitespace in the data file cannot defeat an exact
 *   comparison; `score` is `Number(Score)` (NaN when it is not numeric).
 * @throws If the file cannot be read, is not valid JSON, or its top-level
 *   value is not an array.
 */
function parseSiteScores(): SiteScore[] {
  const raw = fs.readFileSync(path.resolve(FILE_PATH), "utf-8");
  const data: unknown = JSON.parse(raw);

  if (!Array.isArray(data)) {
    throw new Error("Invalid JSON: expected array");
  }

  const result: SiteScore[] = [];
  for (const item of data as unknown[]) {
    // JSON arrays may hold null, primitives or nested arrays; only plain
    // objects can carry the fields we need.
    if (typeof item !== "object" || item === null) {
      continue;
    }

    const { Domain, Score } = item as Partial<Record<keyof RawSiteRecord, unknown>>;
    if (typeof Domain !== "string") {
      continue;
    }

    const domain = Domain.trim().toLowerCase();
    if (domain === "") {
      continue;
    }

    result.push({ domain, score: Number(Score) });
  }

  return result;
}
|
||||
|
||||
// Module-level cache of the parsed site scores: stays null until
// checkDisinfo() first runs, which fills it via parseSiteScores().
let scores: SiteScore[] | null = null
|
||||
|
||||
/**
 * Reports whether a URL belongs to a domain listed in the site-score data.
 *
 * The score list is loaded on the first call and kept in the module-level
 * `scores` cache for later calls. The URL's domain is taken from tldts'
 * `parse(url).domain` and compared for exact equality with each listed
 * domain. When a match is found a warning is logged with the URL and the
 * matching entry's score.
 *
 * @param url URL (or hostname) of the source to check.
 * @returns `true` when the URL's domain is in the list; `false` otherwise,
 *   including when no domain could be extracted from `url`.
 * @throws Whatever `parseSiteScores()` throws if the first-call load fails.
 */
export function checkDisinfo(url: string): boolean {
  if (scores == null) {
    scores = parseSiteScores();
  }

  const domain = parse(url).domain;

  // tldts yields a null domain for input it cannot resolve to one. Bail out
  // here: with a loose comparison, null would match any list entry whose
  // domain is undefined and wrongly flag the URL.
  if (domain == null) {
    return false;
  }

  const match = scores.find((entry) => entry.domain === domain);
  if (match === undefined) {
    return false;
  }

  logger.warn("Bad source %s detected with score %s", url, match.score);
  return true;
}
|
||||
|
||||
|
||||
// console.log(checkDisinfo("http://www.zerohedge.com"))
|
||||
@@ -1,4 +1,5 @@
|
||||
import axios from "axios";
|
||||
import { checkDisinfo } from "./checkDisinfo";
|
||||
|
||||
export async function queryScraper(query: string): Promise<string[]> {
|
||||
const instance = process.env.SCRAPER_INSTANCE;
|
||||
@@ -44,6 +45,10 @@ export async function queryScraper(query: string): Promise<string[]> {
|
||||
const context = data.web ?? [];
|
||||
|
||||
const lines: string[] = context.map((item: any) => {
|
||||
if (checkDisinfo(item.url)) {
|
||||
return "";
|
||||
}
|
||||
|
||||
const title = (item.title ?? "").trim();
|
||||
const desc = (item.description ?? "").trim();
|
||||
const link = (item.url ?? "").trim();
|
||||
|
||||
Reference in New Issue
Block a user