Update documentation. Stop storing context. Decide on final claims source

This commit is contained in:
William Jeynes
2026-03-25 14:24:55 +00:00
parent 872346c657
commit a7f5978f64
6 changed files with 34 additions and 8 deletions
+12
View File
@@ -1,3 +1,15 @@
# Classifier work for evaluating model quality
Made using a dataset of 1000 labeled claims from the MVP pipeline.
RoBERTa model trained on an augmented dataset with LLM-generated adversarial examples for low-frequency labels.
Flan model trained using raw labelled claims; its inherent natural-language ability allows for pattern recognition without the need for fake data.
Regression model trained using the RoBERTa dataset.
Used ensemble model in the final version, with the component models available on Hugging Face.
| Model | % Correct | % Valid taken forward | Used in ensemble | Link |
|------------------------------------------------------------|-----------|----------------------|----------------|------|
| Original | 53.22 | 61.72 |
+4 -4
View File
@@ -16,18 +16,18 @@ BASE_URL = "https://dbkf.ontotext.com/rest-api/search/documents"
# "documentTypes": "http://schema.org/Claim",
# Default query parameters, kept as a list of (name, value) pairs rather than
# a dict so that the same name can be given more than once (see the repeated
# "organization" entries below).
# Presumably these are sent with requests to BASE_URL — confirm against the caller.
DEFAULT_PARAMS = [
("concept", "http://weverify.eu/resource/Concept/Q212"),
("documentTypes", "http://schema.org/Claim"),
# Date window for the search, written as YYYY-MM-DD strings.
("from", "2000-01-01"),
("to", "2026-02-19"),
("lang", "en"),
# NOTE(review): "limit" is listed twice (5000, then 7000). This looks like the
# removed and added lines of a diff shown side by side — confirm that only
# one value is meant to remain.
("limit", 5000),
("limit", 7000),
("page", 1),
("orderBy", "date"),
# One "organization" entry per source; the trailing comment names each one.
("organization", "http://weverify.eu/resource/Organization/128573c5d49d37558706194e755f152d"), # Science Direct
("organization", "http://weverify.eu/resource/Organization/3727f7b2aa90ec0716693e5464b28d18"), # StopFake
("organization", "http://weverify.eu/resource/Organization/c71953fa6cf24ac4178f751c77862070"), # CheckYourFact
]
# NOTE(review): NUM_RANDOM_CLAIMS is assigned twice. If both lines are really
# present, the second assignment (200) is the value in effect; the first looks
# like the removed line of a diff — confirm and keep only one.
# Presumably the number of claims sampled at random — verify against usage.
NUM_RANDOM_CLAIMS = 40
NUM_RANDOM_CLAIMS = 200
# Relative paths, two directory levels up and into data/.
# NOTE(review): what they are resolved against is not shown here — confirm.
INPUT_FILE = "../../data/input.jsonl"
OUTPUT_FILE = "../../data/claims.json"