From 75ca1032a6faf27606fb403751d8f46706f10bd9 Mon Sep 17 00:00:00 2001 From: William Jeynes Date: Sun, 5 Apr 2026 22:47:25 +0100 Subject: [PATCH] Add offset and limit in preparation for the large dataset --- agent/README.md | 3 +-- supporting/Wrapper/run.ts | 16 ++++++++++++++-- supporting/dbkf/fetch.py | 2 +- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/agent/README.md b/agent/README.md index 691ed5f..f2f20ba 100644 --- a/agent/README.md +++ b/agent/README.md @@ -29,5 +29,4 @@ Experiments with different model types: | gpt-5.4-mini | 29/278 | 10.43 | | gpt-5.4-nano | 6/210 | 2.85 | | gpt-4.1-mini | 15/269 | 5.57 | -| gpt-4o-mini | 27/287 | 9.407 | -| llama3.1:8b-instruct-q4_K_M | ? | ? | \ No newline at end of file +| gpt-4o-mini | 27/287 | 9.407 | \ No newline at end of file diff --git a/supporting/Wrapper/run.ts b/supporting/Wrapper/run.ts index 50fcaf9..2565eee 100644 --- a/supporting/Wrapper/run.ts +++ b/supporting/Wrapper/run.ts @@ -19,6 +19,9 @@ const MODE = process.env.MODE ?? "claim"; const MAX_CONCURRENCY = 5; +const OFFSET = parseInt(process.env.OFFSET ?? "0", 10); +const LIMIT = process.env.LIMIT ? parseInt(process.env.LIMIT, 10) : null; + const client = new Client({ apiUrl: API_URL }); @@ -164,10 +167,19 @@ async function processRecord(record: any): Promise { async function main() { console.log("Reading input file..."); - const records = await loadInputs(); + const allRecords = await loadInputs(); - console.log(`Loaded ${records.length} records`); + console.log(`Loaded ${allRecords.length} records`); + const records = allRecords.slice( + OFFSET, + LIMIT !== null ? OFFSET + LIMIT : undefined + ); + + console.log( + `Processing ${records.length} records (offset=${OFFSET}, limit=${LIMIT ?? 
"∞"})` + ); + fs.writeFileSync(OUTPUT_FILE, "", { flag: "a" }); const limit = pLimit(MAX_CONCURRENCY); diff --git a/supporting/dbkf/fetch.py b/supporting/dbkf/fetch.py index 5ca246a..8acbbcc 100644 --- a/supporting/dbkf/fetch.py +++ b/supporting/dbkf/fetch.py @@ -27,7 +27,7 @@ DEFAULT_PARAMS = [ ("organization", "http://weverify.eu/resource/Organization/3727f7b2aa90ec0716693e5464b28d18"), # StopFake ] -NUM_RANDOM_CLAIMS = 200 +NUM_RANDOM_CLAIMS = 2000 INPUT_FILE = "../../data/input.jsonl" OUTPUT_FILE = "../../data/claims.json"