Update documentation. Stop storing context. Decide on final claims source

2026-03-25 14:24:55 +00:00
parent 872346c657
commit a7f5978f64
6 changed files with 34 additions and 8 deletions
@@ -1,3 +1,15 @@
+# Classifier work for evaluating model quality
+
+Built using a dataset of 1,000 labeled claims from the MVP pipeline.
+
+RoBERTa model trained on an augmented dataset with LLM-generated adversarial examples for low-frequency labels.
+
+Flan model trained on the raw labeled claims; its inherent natural-language ability allows for pattern recognition without the need for synthetic data.
+
+Regression model trained on the same augmented dataset as the RoBERTa model.
+
+The final version uses the ensemble model; its component models are available on Hugging Face.
+
 | Model                                                      | % Correct | % Valid taken forward|Used in ensemble|Link
 |------------------------------------------------------------|-----------|----------------------|----------------|-
 | Original                                                   | 53.22     | 61.72                |
@@ -16,18 +16,18 @@ BASE_URL = "https://dbkf.ontotext.com/rest-api/search/documents"

 # "documentTypes": "http://schema.org/Claim",
 DEFAULT_PARAMS = [
-    ("concept", "http://weverify.eu/resource/Concept/Q212"),
+    ("documentTypes", "http://schema.org/Claim"),
    ("from", "2000-01-01"),
    ("to", "2026-02-19"),
    ("lang", "en"),
-    ("limit", 5000),
+    ("limit", 7000),
    ("page", 1),
    ("orderBy", "date"),
+    ("organization", "http://weverify.eu/resource/Organization/128573c5d49d37558706194e755f152d"), # Science Direct
    ("organization", "http://weverify.eu/resource/Organization/3727f7b2aa90ec0716693e5464b28d18"), # StopFake
-    ("organization", "http://weverify.eu/resource/Organization/c71953fa6cf24ac4178f751c77862070"), # CheckYourFact
 ]

-NUM_RANDOM_CLAIMS = 40
+NUM_RANDOM_CLAIMS = 200

 INPUT_FILE = "../../data/input.jsonl"
 OUTPUT_FILE = "../../data/claims.json"