Allow multiple source CSV files for normalisation. Implement real model node. Add normalization prompt. Implement normalization setup. Start on RAG retrieval functions
This commit is contained in:
+6
-4
@@ -8,14 +8,16 @@ import { createDummyModelNode } from "./nodes/dummyModel";
|
||||
import { verificationSetup } from "./nodes/verificationSetup";
|
||||
import { dummyRagasMetrics } from "./nodes/dummyRagasMetrics";
|
||||
import { produceRanking } from "./nodes/produceRanking";
|
||||
import { createModelNode } from "./nodes/model";
|
||||
|
||||
const triggerEventToolNode = createToolNode(arithmeticToolsByName);
|
||||
const verificationToolNode = createToolNode(arithmeticToolsByName);
|
||||
|
||||
const dummyTriggerEventModel = createDummyModelNode("Trigger Events of");
|
||||
const dummyNormalisationModel = createDummyModelNode("Normalised");
|
||||
const dummyVerificationModel = createDummyModelNode("verification of");
|
||||
|
||||
const normalisationModel = createModelNode([], "normalization.txt");
|
||||
|
||||
const triggerEventToolConditional = createToolConditional("triggerEventToolNode", verificationSetup.name);
|
||||
const verificationToolConditional = createToolConditional("verificationToolNode", produceRanking.name);
|
||||
|
||||
@@ -25,7 +27,7 @@ const agent = new StateGraph(MessagesState)
|
||||
//NODES
|
||||
|
||||
.addNode(normalizationSetup.name, normalizationSetup)
|
||||
.addNode("dummyNormalisationModel", dummyNormalisationModel)
|
||||
.addNode("normalisationModel", normalisationModel)
|
||||
|
||||
.addNode("triggerEventToolNode", triggerEventToolNode)
|
||||
.addNode("dummyTriggerEventModel", dummyTriggerEventModel)
|
||||
@@ -37,8 +39,8 @@ const agent = new StateGraph(MessagesState)
|
||||
.addNode(produceRanking.name, produceRanking)
|
||||
|
||||
.addEdge(START, normalizationSetup.name)
|
||||
.addEdge(normalizationSetup.name, "dummyNormalisationModel")
|
||||
.addEdge("dummyNormalisationModel", "dummyTriggerEventModel")
|
||||
.addEdge(normalizationSetup.name, "normalisationModel")
|
||||
.addEdge("normalisationModel", "dummyTriggerEventModel")
|
||||
|
||||
// @ts-expect-error
|
||||
.addConditionalEdges("dummyTriggerEventModel", triggerEventToolConditional, ["triggerEventToolNode", verificationSetup.name])
|
||||
|
||||
+24
-21
@@ -1,24 +1,27 @@
|
||||
// import { SystemMessage } from "@langchain/core/messages";
|
||||
// import { GraphNode } from "@langchain/langgraph";
|
||||
// import { MessagesState } from "../state";
|
||||
// import { arithmeticTools } from "../tools/arithmetic";
|
||||
// import { ChatOpenAI } from "@langchain/openai"
|
||||
import { HumanMessage, SystemMessage } from "@langchain/core/messages";
|
||||
import { GraphNode } from "@langchain/langgraph";
|
||||
import { MessagesState } from "../state";
|
||||
import { ChatOpenAI } from "@langchain/openai"
|
||||
import { hydratePrompt } from "../prompts/hydratePrompt";
|
||||
|
||||
// const model = new ChatOpenAI({
|
||||
// model: "gpt-5-mini"
|
||||
// });
|
||||
/**
 * Builds a graph node that sends the conversation to an OpenAI chat model.
 *
 * On every invocation the node hydrates the prompt file at `promptPath` with
 * `state.disinformationTitle`, places the result first as a system message,
 * appends the messages already held in state, and returns the model's reply.
 *
 * @param tools - Tool definitions bound to the model before it is invoked.
 * @param promptPath - Prompt file name handed to `hydratePrompt`.
 * @returns A node whose state update contains the single model response.
 */
export function createModelNode(tools: any, promptPath: string): GraphNode<typeof MessagesState> {
  return async (state) => {
    const systemPrompt = hydratePrompt(promptPath, state.disinformationTitle);

    // A fresh client is created per invocation, then the tools are attached.
    const chatModel = new ChatOpenAI({ model: "gpt-5-mini" });
    const toolBoundModel = chatModel.bindTools(tools);

    const conversation = [new SystemMessage(systemPrompt), ...state.messages];
    const reply = await toolBoundModel.invoke(conversation);

    return { messages: [reply] };
  };
}
|
||||
@@ -1,9 +1,16 @@
|
||||
import { GraphNode } from "@langchain/langgraph";
|
||||
import { MessagesState } from "../state";
|
||||
import { HumanMessage } from "@langchain/core/messages";
|
||||
import { AIMessage, BaseMessage, HumanMessage } from "@langchain/core/messages";
|
||||
import { calculateSimilarity } from "../tools/clan/retreiveExamples";
|
||||
|
||||
/**
 * Prepares few-shot context for the claim-normalisation model.
 *
 * Looks up the entries most similar to `state.disinformationTitle` via
 * `calculateSimilarity` and turns each match into an AI message pairing the
 * original claim with its normalised form.
 *
 * @returns A state update holding one example message per match, plus the
 *          unchanged `disinformationTitle`.
 */
export const normalizationSetup: GraphNode<typeof MessagesState> = async (state) => {
  const similarityResults = await calculateSimilarity(state.disinformationTitle);

  console.log(similarityResults);

  // A leftover early return used to sit here, which made everything below
  // unreachable and discarded the retrieved matches.
  const messages: BaseMessage[] = similarityResults.map(
    (item) =>
      new AIMessage(`Original Claim: ${item.rawtext}. \n\n Normalised Claim: ${item.cleantext}`)
  );

  return { messages: messages, disinformationTitle: state.disinformationTitle };
};
|
||||
Generated
+171
@@ -19,13 +19,21 @@
|
||||
"dotenv": "^17.2.3",
|
||||
"fs": "^0.0.1-security",
|
||||
"langchain": "^1.2.14",
|
||||
"selenium-webdriver": "^4.40.0",
|
||||
"winston": "^3.19.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^25.1.0",
|
||||
"@types/selenium-webdriver": "^4.35.5",
|
||||
"tsx": "^4.21.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@bazel/runfiles": {
|
||||
"version": "6.5.0",
|
||||
"resolved": "https://registry.npmjs.org/@bazel/runfiles/-/runfiles-6.5.0.tgz",
|
||||
"integrity": "sha512-RzahvqTkfpY2jsDxo8YItPX+/iZ6hbiikw1YhE0bA9EKBR5Og8Pa6FHn9PO9M0zaXRVsr0GFQLKbB/0rzy9SzA==",
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/@cfworker/json-schema": {
|
||||
"version": "4.1.1",
|
||||
"resolved": "https://registry.npmjs.org/@cfworker/json-schema/-/json-schema-4.1.1.tgz",
|
||||
@@ -1243,6 +1251,17 @@
|
||||
"undici-types": "~7.16.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/selenium-webdriver": {
|
||||
"version": "4.35.5",
|
||||
"resolved": "https://registry.npmjs.org/@types/selenium-webdriver/-/selenium-webdriver-4.35.5.tgz",
|
||||
"integrity": "sha512-wCQCjWmahRkUAO7S703UAvBFkxz4o/rjX4T2AOSWKXSi0sTQPsrXxR0GjtFUT0ompedLkYH4R5HO5Urz0hyeog==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/node": "*",
|
||||
"@types/ws": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/triple-beam": {
|
||||
"version": "1.3.5",
|
||||
"resolved": "https://registry.npmjs.org/@types/triple-beam/-/triple-beam-1.3.5.tgz",
|
||||
@@ -1255,6 +1274,16 @@
|
||||
"integrity": "sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/ws": {
|
||||
"version": "8.18.1",
|
||||
"resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.18.1.tgz",
|
||||
"integrity": "sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/ansi-styles": {
|
||||
"version": "5.2.0",
|
||||
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz",
|
||||
@@ -1463,6 +1492,12 @@
|
||||
"simple-wcswidth": "^1.1.2"
|
||||
}
|
||||
},
|
||||
"node_modules/core-util-is": {
|
||||
"version": "1.0.3",
|
||||
"resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz",
|
||||
"integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/csv-parse": {
|
||||
"version": "6.1.0",
|
||||
"resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-6.1.0.tgz",
|
||||
@@ -1753,6 +1788,12 @@
|
||||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
},
|
||||
"node_modules/immediate": {
|
||||
"version": "3.0.6",
|
||||
"resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz",
|
||||
"integrity": "sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/inherits": {
|
||||
"version": "2.0.4",
|
||||
"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
|
||||
@@ -1783,6 +1824,12 @@
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/isarray": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
|
||||
"integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/js-tiktoken": {
|
||||
"version": "1.0.21",
|
||||
"resolved": "https://registry.npmjs.org/js-tiktoken/-/js-tiktoken-1.0.21.tgz",
|
||||
@@ -1798,6 +1845,48 @@
|
||||
"integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==",
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/jszip": {
|
||||
"version": "3.10.1",
|
||||
"resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz",
|
||||
"integrity": "sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==",
|
||||
"license": "(MIT OR GPL-3.0-or-later)",
|
||||
"dependencies": {
|
||||
"lie": "~3.3.0",
|
||||
"pako": "~1.0.2",
|
||||
"readable-stream": "~2.3.6",
|
||||
"setimmediate": "^1.0.5"
|
||||
}
|
||||
},
|
||||
"node_modules/jszip/node_modules/readable-stream": {
|
||||
"version": "2.3.8",
|
||||
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz",
|
||||
"integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"core-util-is": "~1.0.0",
|
||||
"inherits": "~2.0.3",
|
||||
"isarray": "~1.0.0",
|
||||
"process-nextick-args": "~2.0.0",
|
||||
"safe-buffer": "~5.1.1",
|
||||
"string_decoder": "~1.1.1",
|
||||
"util-deprecate": "~1.0.1"
|
||||
}
|
||||
},
|
||||
"node_modules/jszip/node_modules/safe-buffer": {
|
||||
"version": "5.1.2",
|
||||
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
|
||||
"integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/jszip/node_modules/string_decoder": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
|
||||
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"safe-buffer": "~5.1.0"
|
||||
}
|
||||
},
|
||||
"node_modules/kuler": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/kuler/-/kuler-2.0.0.tgz",
|
||||
@@ -1857,6 +1946,15 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/lie": {
|
||||
"version": "3.3.0",
|
||||
"resolved": "https://registry.npmjs.org/lie/-/lie-3.3.0.tgz",
|
||||
"integrity": "sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"immediate": "~3.0.5"
|
||||
}
|
||||
},
|
||||
"node_modules/logform": {
|
||||
"version": "2.7.0",
|
||||
"resolved": "https://registry.npmjs.org/logform/-/logform-2.7.0.tgz",
|
||||
@@ -2062,12 +2160,24 @@
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/pako": {
|
||||
"version": "1.0.11",
|
||||
"resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz",
|
||||
"integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==",
|
||||
"license": "(MIT AND Zlib)"
|
||||
},
|
||||
"node_modules/platform": {
|
||||
"version": "1.3.6",
|
||||
"resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
|
||||
"integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/process-nextick-args": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz",
|
||||
"integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/protobufjs": {
|
||||
"version": "7.5.4",
|
||||
"resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.4.tgz",
|
||||
@@ -2162,6 +2272,31 @@
|
||||
"node": ">=10"
|
||||
}
|
||||
},
|
||||
"node_modules/selenium-webdriver": {
|
||||
"version": "4.40.0",
|
||||
"resolved": "https://registry.npmjs.org/selenium-webdriver/-/selenium-webdriver-4.40.0.tgz",
|
||||
"integrity": "sha512-dU0QbnVKdPmoNP8OtMCazRdtU2Ux6Wl4FEpG1iwUbDeajJK1dBAywBLrC1D7YFRtogHzN96AbXBgBAJaarcysw==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/SeleniumHQ"
|
||||
},
|
||||
{
|
||||
"type": "opencollective",
|
||||
"url": "https://opencollective.com/selenium"
|
||||
}
|
||||
],
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"@bazel/runfiles": "^6.5.0",
|
||||
"jszip": "^3.10.1",
|
||||
"tmp": "^0.2.5",
|
||||
"ws": "^8.18.3"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 20.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/semver": {
|
||||
"version": "7.7.3",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
|
||||
@@ -2195,6 +2330,12 @@
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/setimmediate": {
|
||||
"version": "1.0.5",
|
||||
"resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz",
|
||||
"integrity": "sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/sharp": {
|
||||
"version": "0.34.5",
|
||||
"resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.5.tgz",
|
||||
@@ -2303,6 +2444,15 @@
|
||||
"integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/tmp": {
|
||||
"version": "0.2.5",
|
||||
"resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.5.tgz",
|
||||
"integrity": "sha512-voyz6MApa1rQGUxT3E+BK7/ROe8itEx7vD8/HEvt4xwXucvQ5G5oeEiHkmHZJuBO21RpOf+YYm9MOivj709jow==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=14.14"
|
||||
}
|
||||
},
|
||||
"node_modules/triple-beam": {
|
||||
"version": "1.4.1",
|
||||
"resolved": "https://registry.npmjs.org/triple-beam/-/triple-beam-1.4.1.tgz",
|
||||
@@ -2423,6 +2573,27 @@
|
||||
"node": ">= 12.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/ws": {
|
||||
"version": "8.19.0",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.19.0.tgz",
|
||||
"integrity": "sha512-blAT2mjOEIi0ZzruJfIhb3nps74PRWTCz1IjglWEEpQl5XS/UNama6u2/rjFkDDouqr4L67ry+1aGIALViWjDg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=10.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"bufferutil": "^4.0.1",
|
||||
"utf-8-validate": ">=5.0.2"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"bufferutil": {
|
||||
"optional": true
|
||||
},
|
||||
"utf-8-validate": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/yallist": {
|
||||
"version": "5.0.0",
|
||||
"resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz",
|
||||
|
||||
@@ -20,10 +20,12 @@
|
||||
"dotenv": "^17.2.3",
|
||||
"fs": "^0.0.1-security",
|
||||
"langchain": "^1.2.14",
|
||||
"selenium-webdriver": "^4.40.0",
|
||||
"winston": "^3.19.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^25.1.0",
|
||||
"@types/selenium-webdriver": "^4.35.5",
|
||||
"tsx": "^4.21.0"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
import fs from "fs";
|
||||
|
||||
/**
 * Loads a prompt template and substitutes its placeholder.
 *
 * Reads `prompts/<path>` as UTF-8 and replaces the first `###` marker with
 * `replacement`, inserted verbatim.
 *
 * @param path - File name of the template inside the `prompts/` directory.
 * @param replacement - Text substituted for the `###` marker.
 * @returns The template text with the first marker replaced.
 * @throws Whatever `fs.readFileSync` throws when the file cannot be read.
 */
export function hydratePrompt(path: string, replacement: string): string {
  // TODO: expand into full context-based replacement engine

  const raw = fs.readFileSync("prompts/" + path, "utf-8");

  // A replacer function is used instead of a plain string so that `$`
  // sequences in the replacement (`$$`, `$&`, `$'`, ...) are not expanded
  // by String.prototype.replace and end up in the prompt unchanged.
  return raw.replace("###", () => replacement);
}
|
||||
@@ -0,0 +1,19 @@
|
||||
You are part of an agent in a process to track state-sponsored disinformation
|
||||
|
||||
In order for the following debunk articles to be automatically referenced below an offensive post, the main offensive statement should be extracted, so it can be run in a semantic matcher
|
||||
|
||||
Some of the data comes from debunk datasets, please remove any references to that
|
||||
|
||||
Reduce this title from a disinformation tracking api to a short concise claim
|
||||
|
||||
Make all parts of the claim definite
|
||||
For example:
|
||||
Something could have potentially happened BECOMES something happened
|
||||
DISINFORMATION CLAIM: something is NOT true BECOMES something is true
|
||||
|
||||
Relevant examples are included in preceding messages, use these as exact inspiration.
|
||||
|
||||
The claim to normalize is:
|
||||
###
|
||||
|
||||
Produce no text other than the condensed claim.
|
||||
@@ -3,21 +3,29 @@ import fs from "fs";
|
||||
import { pipeline, cos_sim } from "@huggingface/transformers";
|
||||
import { logger } from "../../utils/logger";
|
||||
|
||||
const CSV_PATH = "./tools/clan/dev-eng.csv";
|
||||
const CACHE_PATH = "./tools/clan/dev-eng.embeddings.json";
|
||||
const CSV_PATHS = [
|
||||
"./tools/clan/dev-eng.csv",
|
||||
// "./tools/clan/test-eng.csv",
|
||||
"./tools/clan/train-eng.csv",
|
||||
];
|
||||
|
||||
const CACHE_PATH = "./tools/clan/dev.embeddings.json";
|
||||
|
||||
type EmbeddingCache = {
|
||||
texts: string[];
|
||||
rawtexts: string[];
|
||||
cleantexts: string[];
|
||||
embeddings: number[][];
|
||||
};
|
||||
|
||||
export type NormalisedMatch = {
|
||||
index: number;
|
||||
score: number;
|
||||
text: string
|
||||
rawtext: string;
|
||||
cleantext: string;
|
||||
};
|
||||
|
||||
let texts: string[] = [];
|
||||
let rawtexts: string[] = [];
|
||||
let cleantexts: string[] = [];
|
||||
let embeddings: number[][] = [];
|
||||
|
||||
const featureExtractor = await pipeline(
|
||||
@@ -33,20 +41,23 @@ async function loadOrBuildCache(): Promise<void> {
|
||||
const raw = fs.readFileSync(CACHE_PATH, "utf-8");
|
||||
const cache: EmbeddingCache = JSON.parse(raw);
|
||||
|
||||
texts = cache.texts;
|
||||
|
||||
rawtexts = cache.rawtexts;
|
||||
cleantexts = cache.cleantexts;
|
||||
embeddings = cache.embeddings.map(e => Array.from(e));
|
||||
|
||||
logger.info("Loaded %s embeddings", embeddings.length);
|
||||
return;
|
||||
}
|
||||
|
||||
logger.warn("Cache not found. Generating embeddings", embeddings.length);
|
||||
logger.warn("Cache not found. Generating embeddings");
|
||||
|
||||
await buildCacheFromCSV();
|
||||
for (const csvPath of CSV_PATHS) {
|
||||
await buildCacheFromCSV(csvPath);
|
||||
}
|
||||
|
||||
const cache: EmbeddingCache = {
|
||||
texts,
|
||||
rawtexts,
|
||||
cleantexts,
|
||||
embeddings,
|
||||
};
|
||||
|
||||
@@ -55,10 +66,12 @@ async function loadOrBuildCache(): Promise<void> {
|
||||
logger.info("Cached %s embeddings", embeddings.length);
|
||||
}
|
||||
|
||||
async function buildCacheFromCSV(): Promise<void> {
|
||||
async function buildCacheFromCSV(csvPath: string): Promise<void> {
|
||||
let count = 0;
|
||||
|
||||
const stream = fs.createReadStream(CSV_PATH).pipe(parse());
|
||||
logger.info("Processing CSV: %s", csvPath);
|
||||
|
||||
const stream = fs.createReadStream(csvPath).pipe(parse());
|
||||
|
||||
for await (const row of stream) {
|
||||
const text = row[0];
|
||||
@@ -69,19 +82,27 @@ async function buildCacheFromCSV(): Promise<void> {
|
||||
normalize: true,
|
||||
});
|
||||
|
||||
texts.push(text);
|
||||
rawtexts.push(text);
|
||||
cleantexts.push(row[1]);
|
||||
const vector = Array.from(output.data as Float32Array);
|
||||
embeddings.push(vector);
|
||||
|
||||
|
||||
count++;
|
||||
if (count % 100 === 0) {
|
||||
logger.info("Processed %s", count);
|
||||
}
|
||||
logger.info("[%s] Processed %s rows", csvPath, count);
|
||||
}
|
||||
}
|
||||
|
||||
export async function calculateSimilarity(query: string,topK = 5): Promise<NormalisedMatch[]> {
|
||||
logger.info("[%s] Finished (%s rows)", csvPath, count);
|
||||
}
|
||||
|
||||
export async function calculateSimilarity(
|
||||
query: string,
|
||||
topK = 5
|
||||
): Promise<NormalisedMatch[]> {
|
||||
await loadOrBuildCache()
|
||||
|
||||
const queryEmbedding = await featureExtractor(query, {
|
||||
pooling: "mean",
|
||||
normalize: true,
|
||||
@@ -91,17 +112,9 @@ export async function calculateSimilarity(query: string,topK = 5): Promise<Norma
|
||||
.map((embedding, index) => ({
|
||||
index,
|
||||
score: cos_sim(embedding, queryEmbedding.data as number[]),
|
||||
text: texts[index],
|
||||
rawtext: rawtexts[index],
|
||||
cleantext: cleantexts[index]
|
||||
}))
|
||||
.sort((a, b) => b.score - a.score)
|
||||
.slice(0, topK);
|
||||
}
|
||||
|
||||
//TEMP: testing code
|
||||
await loadOrBuildCache();
|
||||
|
||||
const results = await calculateSimilarity(
|
||||
"Wonderful to see London has taken a stand to defend freedom and the right to choose."
|
||||
);
|
||||
|
||||
console.log(results);
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
import { Builder, Browser } from "selenium-webdriver";
|
||||
import firefox from "selenium-webdriver/firefox";
|
||||
|
||||
/**
 * Opens `url` in a Firefox session started with the `--headless` argument and
 * returns the value of `document.body.innerText` for the loaded page.
 *
 * The browser session is always shut down, whether or not loading or
 * extraction succeeds.
 *
 * @param url - Address of the page to load.
 * @returns The page's `document.body.innerText`.
 */
async function extractWebpageContent(url: string): Promise<string> {
  const firefoxOptions = new firefox.Options();
  firefoxOptions.addArguments("--headless");

  const browser = await new Builder()
    .forBrowser(Browser.FIREFOX)
    .setFirefoxOptions(firefoxOptions)
    .build();

  try {
    await browser.get(url);

    // Block until the document reports readyState "complete"; 5000 is the
    // timeout handed to wait().
    await browser.wait(async () => {
      return await browser.executeScript("return document.readyState === 'complete'");
    }, 5000);

    const pageText = (await browser.executeScript("return document.body.innerText;")) as string;
    return pageText;
  } finally {
    await browser.quit();
  }
}
|
||||
|
||||
//TODO: Extract, rank snippets
|
||||
|
||||
//console.log(await extractWebpageContent("https://www.bbc.co.uk/news/live/c74wd01egvyt"))
|
||||
Reference in New Issue
Block a user