diff --git a/README.md b/README.md index 68fc130..97d1b68 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ Final Dissertation Submission Repository | ├── claims.json # Retreived claims from dbkf fetcher | ├── dev-eng.csv | ├── train-eng.csv # Normalized disinformation claims in CSV format from CLAN +| ├── Iffy.json # Iffy dataset of disinformation domains | ├── input.jsonl # Response in cleaned format to give as context to agent | ├── ranked.jsonl # Cleaned trigger event response from scorer frontend | └── results.jsonl # Output from wrapper script, read and modified by scorer diff --git a/agent/tools/retreiveExamples.ts b/agent/tools/retreiveExamples.ts index 27cbf39..58067a6 100644 --- a/agent/tools/retreiveExamples.ts +++ b/agent/tools/retreiveExamples.ts @@ -244,22 +244,27 @@ async function ensureExampleClaimJsonlLoaded(): Promise { input: stream, crlfDelay: Infinity, }); + let skipped = 0; for await (const line of rl) { if (!line.trim()) continue; // skip empty lines const row = JSON.parse(line); + const parsed_content = row.events; + + const filtered_content = parsed_content.filter(itm => itm.human_score > 0.5 && itm.score > 0.5) + + if (filtered_content.length == 0) { + skipped++; + continue; + } + const text = row.text; const embedding = await embedText(text); jsonlRawtexts.push(text); - - const parsed_content = row.events; - - const filtered_content = parsed_content.filter(itm => itm.human_score > 0.5 && itm.score > 0.5) - jsonlCleantexts.push(JSON.stringify(filtered_content)); jsonlEmbeddings.push(embedding); } @@ -268,7 +273,7 @@ async function ensureExampleClaimJsonlLoaded(): Promise { jsonlBM25 = buildBM25(jsonlRawtexts); jsonlLoaded = true; - logger.info("JSONL ranking done"); + logger.info("JSONL ranking done, %s items skipped for having no good events", skipped); }