Allow multiple source CSV files for normalisation. Implement real model node. Add normalization prompt. Implement normalization setup. Start on RAG retrieval functions

This commit is contained in:
William Jeynes
2026-02-09 16:32:40 +00:00
parent 8eaa7bfbff
commit 02eac0f553
9 changed files with 311 additions and 56 deletions
+6 -4
View File
@@ -8,14 +8,16 @@ import { createDummyModelNode } from "./nodes/dummyModel";
import { verificationSetup } from "./nodes/verificationSetup"; import { verificationSetup } from "./nodes/verificationSetup";
import { dummyRagasMetrics } from "./nodes/dummyRagasMetrics"; import { dummyRagasMetrics } from "./nodes/dummyRagasMetrics";
import { produceRanking } from "./nodes/produceRanking"; import { produceRanking } from "./nodes/produceRanking";
import { createModelNode } from "./nodes/model";
const triggerEventToolNode = createToolNode(arithmeticToolsByName); const triggerEventToolNode = createToolNode(arithmeticToolsByName);
const verificationToolNode = createToolNode(arithmeticToolsByName); const verificationToolNode = createToolNode(arithmeticToolsByName);
const dummyTriggerEventModel = createDummyModelNode("Trigger Events of"); const dummyTriggerEventModel = createDummyModelNode("Trigger Events of");
const dummyNormalisationModel = createDummyModelNode("Normalised");
const dummyVerificationModel = createDummyModelNode("verification of"); const dummyVerificationModel = createDummyModelNode("verification of");
const normalisationModel = createModelNode([], "normalization.txt");
const triggerEventToolConditional = createToolConditional("triggerEventToolNode", verificationSetup.name); const triggerEventToolConditional = createToolConditional("triggerEventToolNode", verificationSetup.name);
const verificationToolConditional = createToolConditional("verificationToolNode", produceRanking.name); const verificationToolConditional = createToolConditional("verificationToolNode", produceRanking.name);
@@ -25,7 +27,7 @@ const agent = new StateGraph(MessagesState)
//NODES //NODES
.addNode(normalizationSetup.name, normalizationSetup) .addNode(normalizationSetup.name, normalizationSetup)
.addNode("dummyNormalisationModel", dummyNormalisationModel) .addNode("normalisationModel", normalisationModel)
.addNode("triggerEventToolNode", triggerEventToolNode) .addNode("triggerEventToolNode", triggerEventToolNode)
.addNode("dummyTriggerEventModel", dummyTriggerEventModel) .addNode("dummyTriggerEventModel", dummyTriggerEventModel)
@@ -37,8 +39,8 @@ const agent = new StateGraph(MessagesState)
.addNode(produceRanking.name, produceRanking) .addNode(produceRanking.name, produceRanking)
.addEdge(START, normalizationSetup.name) .addEdge(START, normalizationSetup.name)
.addEdge(normalizationSetup.name, "dummyNormalisationModel") .addEdge(normalizationSetup.name, "normalisationModel")
.addEdge("dummyNormalisationModel", "dummyTriggerEventModel") .addEdge("normalisationModel", "dummyTriggerEventModel")
// @ts-expect-error // @ts-expect-error
.addConditionalEdges("dummyTriggerEventModel", triggerEventToolConditional, ["triggerEventToolNode", verificationSetup.name]) .addConditionalEdges("dummyTriggerEventModel", triggerEventToolConditional, ["triggerEventToolNode", verificationSetup.name])
+24 -21
View File
@@ -1,24 +1,27 @@
// import { SystemMessage } from "@langchain/core/messages"; import { HumanMessage, SystemMessage } from "@langchain/core/messages";
// import { GraphNode } from "@langchain/langgraph"; import { GraphNode } from "@langchain/langgraph";
// import { MessagesState } from "../state"; import { MessagesState } from "../state";
// import { arithmeticTools } from "../tools/arithmetic"; import { ChatOpenAI } from "@langchain/openai"
// import { ChatOpenAI } from "@langchain/openai" import { hydratePrompt } from "../prompts/hydratePrompt";
// const model = new ChatOpenAI({ export function createModelNode(tools: any, promptPath: string): GraphNode<typeof MessagesState> {
// model: "gpt-5-mini" return async (state) => {
// }); const sysPrompt = hydratePrompt(promptPath, state.disinformationTitle)
// const modelWithTools = model.bindTools(arithmeticTools); const model = new ChatOpenAI({
model: "gpt-5-mini"
});
const modelWithTools = model.bindTools(tools);
// export const llmCall: GraphNode<typeof MessagesState> = async (state) => { const response = await modelWithTools.invoke([
// const response = await modelWithTools.invoke([ new SystemMessage(
// new SystemMessage( sysPrompt
// "You are a helpful assistant tasked with performing arithmetic on a set of inputs. Any calculation, no matter how trivial, should be done with tools. Output the final answer with %%% on each side" ),
// ), ...state.messages,
// ...state.messages, ]);
// ]);
// return { return {
// messages: [response], messages: [response]
// llmCalls: 1, };
// }; };
// }; }
+10 -3
View File
@@ -1,9 +1,16 @@
import { GraphNode } from "@langchain/langgraph"; import { GraphNode } from "@langchain/langgraph";
import { MessagesState } from "../state"; import { MessagesState } from "../state";
import { HumanMessage } from "@langchain/core/messages"; import { AIMessage, BaseMessage, HumanMessage } from "@langchain/core/messages";
import { calculateSimilarity } from "../tools/clan/retreiveExamples";
export const normalizationSetup: GraphNode<typeof MessagesState> = async (state) => { export const normalizationSetup: GraphNode<typeof MessagesState> = async (state) => {
//TODO: Implement claim normalisation, using few shot prompting and CLAN Dataset let similarityResults = await calculateSimilarity(state.disinformationTitle)
return { messages: [ new HumanMessage(state.disinformationTitle)] }; console.log(similarityResults)
let messages : BaseMessage[] = similarityResults.map((item) => {
return new AIMessage(`Original Claim: ${item.rawtext}. \n\n Normalised Claim: ${item.cleantext}`)
})
return { messages: messages, disinformationTitle: state.disinformationTitle };
}; };
+171
View File
@@ -19,13 +19,21 @@
"dotenv": "^17.2.3", "dotenv": "^17.2.3",
"fs": "^0.0.1-security", "fs": "^0.0.1-security",
"langchain": "^1.2.14", "langchain": "^1.2.14",
"selenium-webdriver": "^4.40.0",
"winston": "^3.19.0" "winston": "^3.19.0"
}, },
"devDependencies": { "devDependencies": {
"@types/node": "^25.1.0", "@types/node": "^25.1.0",
"@types/selenium-webdriver": "^4.35.5",
"tsx": "^4.21.0" "tsx": "^4.21.0"
} }
}, },
"node_modules/@bazel/runfiles": {
"version": "6.5.0",
"resolved": "https://registry.npmjs.org/@bazel/runfiles/-/runfiles-6.5.0.tgz",
"integrity": "sha512-RzahvqTkfpY2jsDxo8YItPX+/iZ6hbiikw1YhE0bA9EKBR5Og8Pa6FHn9PO9M0zaXRVsr0GFQLKbB/0rzy9SzA==",
"license": "Apache-2.0"
},
"node_modules/@cfworker/json-schema": { "node_modules/@cfworker/json-schema": {
"version": "4.1.1", "version": "4.1.1",
"resolved": "https://registry.npmjs.org/@cfworker/json-schema/-/json-schema-4.1.1.tgz", "resolved": "https://registry.npmjs.org/@cfworker/json-schema/-/json-schema-4.1.1.tgz",
@@ -1243,6 +1251,17 @@
"undici-types": "~7.16.0" "undici-types": "~7.16.0"
} }
}, },
"node_modules/@types/selenium-webdriver": {
"version": "4.35.5",
"resolved": "https://registry.npmjs.org/@types/selenium-webdriver/-/selenium-webdriver-4.35.5.tgz",
"integrity": "sha512-wCQCjWmahRkUAO7S703UAvBFkxz4o/rjX4T2AOSWKXSi0sTQPsrXxR0GjtFUT0ompedLkYH4R5HO5Urz0hyeog==",
"dev": true,
"license": "MIT",
"dependencies": {
"@types/node": "*",
"@types/ws": "*"
}
},
"node_modules/@types/triple-beam": { "node_modules/@types/triple-beam": {
"version": "1.3.5", "version": "1.3.5",
"resolved": "https://registry.npmjs.org/@types/triple-beam/-/triple-beam-1.3.5.tgz", "resolved": "https://registry.npmjs.org/@types/triple-beam/-/triple-beam-1.3.5.tgz",
@@ -1255,6 +1274,16 @@
"integrity": "sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==", "integrity": "sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/@types/ws": {
"version": "8.18.1",
"resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.18.1.tgz",
"integrity": "sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg==",
"dev": true,
"license": "MIT",
"dependencies": {
"@types/node": "*"
}
},
"node_modules/ansi-styles": { "node_modules/ansi-styles": {
"version": "5.2.0", "version": "5.2.0",
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz",
@@ -1463,6 +1492,12 @@
"simple-wcswidth": "^1.1.2" "simple-wcswidth": "^1.1.2"
} }
}, },
"node_modules/core-util-is": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz",
"integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==",
"license": "MIT"
},
"node_modules/csv-parse": { "node_modules/csv-parse": {
"version": "6.1.0", "version": "6.1.0",
"resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-6.1.0.tgz", "resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-6.1.0.tgz",
@@ -1753,6 +1788,12 @@
"url": "https://github.com/sponsors/ljharb" "url": "https://github.com/sponsors/ljharb"
} }
}, },
"node_modules/immediate": {
"version": "3.0.6",
"resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz",
"integrity": "sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==",
"license": "MIT"
},
"node_modules/inherits": { "node_modules/inherits": {
"version": "2.0.4", "version": "2.0.4",
"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
@@ -1783,6 +1824,12 @@
"url": "https://github.com/sponsors/sindresorhus" "url": "https://github.com/sponsors/sindresorhus"
} }
}, },
"node_modules/isarray": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
"integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==",
"license": "MIT"
},
"node_modules/js-tiktoken": { "node_modules/js-tiktoken": {
"version": "1.0.21", "version": "1.0.21",
"resolved": "https://registry.npmjs.org/js-tiktoken/-/js-tiktoken-1.0.21.tgz", "resolved": "https://registry.npmjs.org/js-tiktoken/-/js-tiktoken-1.0.21.tgz",
@@ -1798,6 +1845,48 @@
"integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==", "integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==",
"license": "ISC" "license": "ISC"
}, },
"node_modules/jszip": {
"version": "3.10.1",
"resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz",
"integrity": "sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==",
"license": "(MIT OR GPL-3.0-or-later)",
"dependencies": {
"lie": "~3.3.0",
"pako": "~1.0.2",
"readable-stream": "~2.3.6",
"setimmediate": "^1.0.5"
}
},
"node_modules/jszip/node_modules/readable-stream": {
"version": "2.3.8",
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz",
"integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==",
"license": "MIT",
"dependencies": {
"core-util-is": "~1.0.0",
"inherits": "~2.0.3",
"isarray": "~1.0.0",
"process-nextick-args": "~2.0.0",
"safe-buffer": "~5.1.1",
"string_decoder": "~1.1.1",
"util-deprecate": "~1.0.1"
}
},
"node_modules/jszip/node_modules/safe-buffer": {
"version": "5.1.2",
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
"integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==",
"license": "MIT"
},
"node_modules/jszip/node_modules/string_decoder": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
"license": "MIT",
"dependencies": {
"safe-buffer": "~5.1.0"
}
},
"node_modules/kuler": { "node_modules/kuler": {
"version": "2.0.0", "version": "2.0.0",
"resolved": "https://registry.npmjs.org/kuler/-/kuler-2.0.0.tgz", "resolved": "https://registry.npmjs.org/kuler/-/kuler-2.0.0.tgz",
@@ -1857,6 +1946,15 @@
} }
} }
}, },
"node_modules/lie": {
"version": "3.3.0",
"resolved": "https://registry.npmjs.org/lie/-/lie-3.3.0.tgz",
"integrity": "sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==",
"license": "MIT",
"dependencies": {
"immediate": "~3.0.5"
}
},
"node_modules/logform": { "node_modules/logform": {
"version": "2.7.0", "version": "2.7.0",
"resolved": "https://registry.npmjs.org/logform/-/logform-2.7.0.tgz", "resolved": "https://registry.npmjs.org/logform/-/logform-2.7.0.tgz",
@@ -2062,12 +2160,24 @@
"node": ">=8" "node": ">=8"
} }
}, },
"node_modules/pako": {
"version": "1.0.11",
"resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz",
"integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==",
"license": "(MIT AND Zlib)"
},
"node_modules/platform": { "node_modules/platform": {
"version": "1.3.6", "version": "1.3.6",
"resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz", "resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
"integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==", "integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/process-nextick-args": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz",
"integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==",
"license": "MIT"
},
"node_modules/protobufjs": { "node_modules/protobufjs": {
"version": "7.5.4", "version": "7.5.4",
"resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.4.tgz", "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.4.tgz",
@@ -2162,6 +2272,31 @@
"node": ">=10" "node": ">=10"
} }
}, },
"node_modules/selenium-webdriver": {
"version": "4.40.0",
"resolved": "https://registry.npmjs.org/selenium-webdriver/-/selenium-webdriver-4.40.0.tgz",
"integrity": "sha512-dU0QbnVKdPmoNP8OtMCazRdtU2Ux6Wl4FEpG1iwUbDeajJK1dBAywBLrC1D7YFRtogHzN96AbXBgBAJaarcysw==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/SeleniumHQ"
},
{
"type": "opencollective",
"url": "https://opencollective.com/selenium"
}
],
"license": "Apache-2.0",
"dependencies": {
"@bazel/runfiles": "^6.5.0",
"jszip": "^3.10.1",
"tmp": "^0.2.5",
"ws": "^8.18.3"
},
"engines": {
"node": ">= 20.0.0"
}
},
"node_modules/semver": { "node_modules/semver": {
"version": "7.7.3", "version": "7.7.3",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz", "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
@@ -2195,6 +2330,12 @@
"url": "https://github.com/sponsors/sindresorhus" "url": "https://github.com/sponsors/sindresorhus"
} }
}, },
"node_modules/setimmediate": {
"version": "1.0.5",
"resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz",
"integrity": "sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==",
"license": "MIT"
},
"node_modules/sharp": { "node_modules/sharp": {
"version": "0.34.5", "version": "0.34.5",
"resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.5.tgz", "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.5.tgz",
@@ -2303,6 +2444,15 @@
"integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==", "integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/tmp": {
"version": "0.2.5",
"resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.5.tgz",
"integrity": "sha512-voyz6MApa1rQGUxT3E+BK7/ROe8itEx7vD8/HEvt4xwXucvQ5G5oeEiHkmHZJuBO21RpOf+YYm9MOivj709jow==",
"license": "MIT",
"engines": {
"node": ">=14.14"
}
},
"node_modules/triple-beam": { "node_modules/triple-beam": {
"version": "1.4.1", "version": "1.4.1",
"resolved": "https://registry.npmjs.org/triple-beam/-/triple-beam-1.4.1.tgz", "resolved": "https://registry.npmjs.org/triple-beam/-/triple-beam-1.4.1.tgz",
@@ -2423,6 +2573,27 @@
"node": ">= 12.0.0" "node": ">= 12.0.0"
} }
}, },
"node_modules/ws": {
"version": "8.19.0",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.19.0.tgz",
"integrity": "sha512-blAT2mjOEIi0ZzruJfIhb3nps74PRWTCz1IjglWEEpQl5XS/UNama6u2/rjFkDDouqr4L67ry+1aGIALViWjDg==",
"license": "MIT",
"engines": {
"node": ">=10.0.0"
},
"peerDependencies": {
"bufferutil": "^4.0.1",
"utf-8-validate": ">=5.0.2"
},
"peerDependenciesMeta": {
"bufferutil": {
"optional": true
},
"utf-8-validate": {
"optional": true
}
}
},
"node_modules/yallist": { "node_modules/yallist": {
"version": "5.0.0", "version": "5.0.0",
"resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz", "resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz",
+2
View File
@@ -20,10 +20,12 @@
"dotenv": "^17.2.3", "dotenv": "^17.2.3",
"fs": "^0.0.1-security", "fs": "^0.0.1-security",
"langchain": "^1.2.14", "langchain": "^1.2.14",
"selenium-webdriver": "^4.40.0",
"winston": "^3.19.0" "winston": "^3.19.0"
}, },
"devDependencies": { "devDependencies": {
"@types/node": "^25.1.0", "@types/node": "^25.1.0",
"@types/selenium-webdriver": "^4.35.5",
"tsx": "^4.21.0" "tsx": "^4.21.0"
} }
} }
+9
View File
@@ -0,0 +1,9 @@
import fs from "fs";
/**
 * Loads a prompt template from the `prompts/` directory and fills in its placeholder.
 *
 * Only the first `###` marker in the template is substituted; any later
 * occurrences are left untouched. The replacement text is inserted literally.
 *
 * @param path - Template file name, appended to `prompts/` (resolved against the process working directory).
 * @param replacement - Text substituted for the first `###` marker.
 * @returns The template contents with the marker filled in.
 * @throws Whatever `fs.readFileSync` throws when the template cannot be read.
 */
export function hydratePrompt(path: string, replacement: string): string {
  // TODO: expand into full context-based replacement engine
  const raw = fs.readFileSync("prompts/" + path, "utf-8");
  // Use a replacer function rather than a replacement string: a string argument
  // would have `$$`, `$&`, "$`" and `$'` expanded as special patterns, which
  // corrupts replacement text that happens to contain a dollar sign.
  return raw.replace("###", () => replacement);
}
+19
View File
@@ -0,0 +1,19 @@
You are part of an agent in a process to track state-sponsored disinformation.
In order for the following debunk articles to be automatically referenced below an offensive post, the main offensive statement should be extracted, so it can be run in a semantic matcher
Some of the data comes from debunk datasets, please remove any references to that
Reduce this title from a disinformation tracking api to a short concise claim
Make all parts of the claim definite
For example:
Something could have potentially happened BECOMES something happened
DISINFORMATION CLAIM: something is NOT true BECOMES something is true
Relevant examples are included in preceding messages, use these as exact inspiration.
The claim to normalize is:
###
Produce no text other than the condensed claim.
+39 -26
View File
@@ -3,21 +3,29 @@ import fs from "fs";
import { pipeline, cos_sim } from "@huggingface/transformers"; import { pipeline, cos_sim } from "@huggingface/transformers";
import { logger } from "../../utils/logger"; import { logger } from "../../utils/logger";
const CSV_PATH = "./tools/clan/dev-eng.csv"; const CSV_PATHS = [
const CACHE_PATH = "./tools/clan/dev-eng.embeddings.json"; "./tools/clan/dev-eng.csv",
// "./tools/clan/test-eng.csv",
"./tools/clan/train-eng.csv",
];
const CACHE_PATH = "./tools/clan/dev.embeddings.json";
type EmbeddingCache = { type EmbeddingCache = {
texts: string[]; rawtexts: string[];
cleantexts: string[];
embeddings: number[][]; embeddings: number[][];
}; };
export type NormalisedMatch = { export type NormalisedMatch = {
index: number; index: number;
score: number; score: number;
text: string rawtext: string;
cleantext: string;
}; };
let texts: string[] = []; let rawtexts: string[] = [];
let cleantexts: string[] = [];
let embeddings: number[][] = []; let embeddings: number[][] = [];
const featureExtractor = await pipeline( const featureExtractor = await pipeline(
@@ -33,20 +41,23 @@ async function loadOrBuildCache(): Promise<void> {
const raw = fs.readFileSync(CACHE_PATH, "utf-8"); const raw = fs.readFileSync(CACHE_PATH, "utf-8");
const cache: EmbeddingCache = JSON.parse(raw); const cache: EmbeddingCache = JSON.parse(raw);
texts = cache.texts; rawtexts = cache.rawtexts;
cleantexts = cache.cleantexts;
embeddings = cache.embeddings.map(e => Array.from(e)); embeddings = cache.embeddings.map(e => Array.from(e));
logger.info("Loaded %s embeddings", embeddings.length); logger.info("Loaded %s embeddings", embeddings.length);
return; return;
} }
logger.warn("Cache not found. Generating embeddings", embeddings.length); logger.warn("Cache not found. Generating embeddings");
await buildCacheFromCSV(); for (const csvPath of CSV_PATHS) {
await buildCacheFromCSV(csvPath);
}
const cache: EmbeddingCache = { const cache: EmbeddingCache = {
texts, rawtexts,
cleantexts,
embeddings, embeddings,
}; };
@@ -55,10 +66,12 @@ async function loadOrBuildCache(): Promise<void> {
logger.info("Cached %s embeddings", embeddings.length); logger.info("Cached %s embeddings", embeddings.length);
} }
async function buildCacheFromCSV(): Promise<void> { async function buildCacheFromCSV(csvPath: string): Promise<void> {
let count = 0; let count = 0;
const stream = fs.createReadStream(CSV_PATH).pipe(parse()); logger.info("Processing CSV: %s", csvPath);
const stream = fs.createReadStream(csvPath).pipe(parse());
for await (const row of stream) { for await (const row of stream) {
const text = row[0]; const text = row[0];
@@ -69,19 +82,27 @@ async function buildCacheFromCSV(): Promise<void> {
normalize: true, normalize: true,
}); });
texts.push(text); rawtexts.push(text);
cleantexts.push(row[1]);
const vector = Array.from(output.data as Float32Array); const vector = Array.from(output.data as Float32Array);
embeddings.push(vector); embeddings.push(vector);
count++; count++;
if (count % 100 === 0) { if (count % 100 === 0) {
logger.info("Processed %s", count); logger.info("[%s] Processed %s rows", csvPath, count);
}
} }
} }
export async function calculateSimilarity(query: string,topK = 5): Promise<NormalisedMatch[]> { logger.info("[%s] Finished (%s rows)", csvPath, count);
}
export async function calculateSimilarity(
query: string,
topK = 5
): Promise<NormalisedMatch[]> {
await loadOrBuildCache()
const queryEmbedding = await featureExtractor(query, { const queryEmbedding = await featureExtractor(query, {
pooling: "mean", pooling: "mean",
normalize: true, normalize: true,
@@ -91,17 +112,9 @@ export async function calculateSimilarity(query: string,topK = 5): Promise<Norma
.map((embedding, index) => ({ .map((embedding, index) => ({
index, index,
score: cos_sim(embedding, queryEmbedding.data as number[]), score: cos_sim(embedding, queryEmbedding.data as number[]),
text: texts[index], rawtext: rawtexts[index],
cleantext: cleantexts[index]
})) }))
.sort((a, b) => b.score - a.score) .sort((a, b) => b.score - a.score)
.slice(0, topK); .slice(0, topK);
} }
//TEMP: testing code
await loadOrBuildCache();
const results = await calculateSimilarity(
"Wonderful to see London has taken a stand to defend freedom and the right to choose."
);
console.log(results);
+29
View File
@@ -0,0 +1,29 @@
import { Builder, Browser } from "selenium-webdriver";
import firefox from "selenium-webdriver/firefox";
/**
 * Opens `url` in a headless Firefox session and returns the page's visible text.
 *
 * The function navigates to the URL, waits up to 5000 ms for
 * `document.readyState` to report `'complete'`, then reads
 * `document.body.innerText`. The browser session is always shut down,
 * whether the extraction succeeds or throws.
 *
 * @param url - Address of the page to load.
 * @returns The value of `document.body.innerText` for the loaded page.
 */
async function extractWebpageContent(url: string): Promise<string> {
  const firefoxOptions = new firefox.Options();
  firefoxOptions.addArguments("--headless");

  const browser = await new Builder()
    .forBrowser(Browser.FIREFOX)
    .setFirefoxOptions(firefoxOptions)
    .build();

  // Condition polled by `wait`: resolves truthy once the document has finished loading.
  const pageLoaded = async () =>
    await browser.executeScript("return document.readyState === 'complete'");

  try {
    await browser.get(url);
    await browser.wait(pageLoaded, 5000);

    const bodyText = (await browser.executeScript(
      "return document.body.innerText;"
    )) as string;
    return bodyText;
  } finally {
    // Always release the browser process, even when navigation or the wait fails.
    await browser.quit();
  }
}
//TODO: Extract, rank snippets
//console.log(await extractWebpageContent("https://www.bbc.co.uk/news/live/c74wd01egvyt"))