44 lines
1.2 KiB
TypeScript
44 lines
1.2 KiB
TypeScript
import { pipeline, cos_sim } from "@huggingface/transformers";
|
|
|
|
let featureExtractor = await pipeline(
|
|
"feature-extraction",
|
|
"Xenova/all-MiniLM-L6-v2"
|
|
);
|
|
|
|
export async function removeDuplicates(state: any) {
|
|
const embeddings: number[][] = [];
|
|
|
|
const outputs = await featureExtractor(
|
|
state.map(s => s.Event),
|
|
{ pooling: "mean", normalize: true }
|
|
);
|
|
|
|
for (const o of outputs) {
|
|
embeddings.push(Array.from(o.data));
|
|
}
|
|
|
|
const len = state.length;
|
|
for (let i = 0; i < len; i++) {
|
|
for (let j = i + 1; j < len; j++) {
|
|
if (state[i].score === -1 || state[j].score === -1) continue;
|
|
|
|
const sim = cos_sim(embeddings[i], embeddings[j]);
|
|
console.log(sim)
|
|
if (sim > 0.55) {
|
|
const scoreI = state[i].score ?? 0;
|
|
const scoreJ = state[j].score ?? 0;
|
|
|
|
if (scoreI > scoreJ) {
|
|
state[j].score = -1;
|
|
} else if (scoreJ > scoreI) {
|
|
state[i].score = -1;
|
|
} else {
|
|
// if equal, keep earlier
|
|
state[j].score = -1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return state;
|
|
} |