Add offset and limit in preparation for the large dataset

2026-04-05 22:47:25 +01:00
parent 00d129bd28
commit 75ca1032a6
3 changed files with 16 additions and 5 deletions
@@ -30,4 +30,3 @@ Experiments with different model types:
 | gpt-5.4-nano                  | 6/210     | 2.85     |
 | gpt-4.1-mini                  | 15/269    | 5.57     |
 | gpt-4o-mini                   | 27/287    | 9.407    |
-| llama3.1:8b-instruct-q4_K_M   | ?         | ?        |
@@ -19,6 +19,9 @@ const MODE = process.env.MODE ?? "claim";

 const MAX_CONCURRENCY = 5;

+const OFFSET = parseInt(process.env.OFFSET ?? "0", 10);
+const LIMIT = process.env.LIMIT ? parseInt(process.env.LIMIT, 10) : null;
+
 const client = new Client({ apiUrl: API_URL });


@@ -164,9 +167,18 @@ async function processRecord(record: any): Promise<ResultRecord> {
 async function main() {
  console.log("Reading input file...");

-  const records = await loadInputs();
+  const allRecords = await loadInputs();

-  console.log(`Loaded ${records.length} records`);
+  console.log(`Loaded ${allRecords.length} records`);
+
+  const records = allRecords.slice(
+    OFFSET,
+    LIMIT !== null ? OFFSET + LIMIT : undefined
+  );
+
+  console.log(
+    `Processing ${records.length} records (offset=${OFFSET}, limit=${LIMIT ?? "∞"})`
+  );
  
  fs.writeFileSync(OUTPUT_FILE, "", { flag: "a" });

@@ -27,7 +27,7 @@ DEFAULT_PARAMS = [
    ("organization", "http://weverify.eu/resource/Organization/3727f7b2aa90ec0716693e5464b28d18"), # StopFake
 ]

-NUM_RANDOM_CLAIMS = 200
+NUM_RANDOM_CLAIMS = 2000

 INPUT_FILE = "../../data/input.jsonl"
 OUTPUT_FILE = "../../data/claims.json"