76 lines
2.7 KiB
Python
76 lines
2.7 KiB
Python
import json
|
||
from pathlib import Path
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
from threading import Lock
|
||
from tqdm import tqdm
|
||
from dotenv import load_dotenv
|
||
from openai import OpenAI
|
||
|
||
ENV_PATH = Path("../../agent/.env")
|
||
load_dotenv(dotenv_path=ENV_PATH)
|
||
|
||
client = OpenAI()
|
||
|
||
INPUT_FILE = "../../data/reranked/0_original.jsonl"
|
||
OUTPUT_FILE = "output.txt"
|
||
MODEL = "gpt-5-nano"
|
||
|
||
MAX_WORKERS = 60 # tune this
|
||
|
||
write_lock = Lock()
|
||
|
||
|
||
def make_request(line):
|
||
try:
|
||
data = json.loads(line)
|
||
prompt = (
|
||
"Provide a non specific piece of background, tallking point or other minformaiton that allowed the a disinformaition to spread; to aid in analysis and debunking"
|
||
"The topic in question is: "
|
||
+ data.get("text", "")
|
||
+ " Include just the example no other text."
|
||
+ " A good example would be"
|
||
+ "'Existing high-profile reporting and public discussion throughout 2022–2023 about foreign fighters and mercenary recruitment (including Russian recruitment and Wagner Group activity).'"
|
||
+ "Since it focusses on non-instantiated or proven discussion points"
|
||
+ " and "
|
||
+ "'2016 – Continued EU–Russia business dialogues and investments (documented by policy institutes and trade analyses) showing ongoing economic links despite political tensions'"
|
||
+ "Since it does not name a specific dialogue or investement"
|
||
+ " Be concise, make no mistakes, use similar style and wording to provided examples"
|
||
)
|
||
|
||
if not prompt:
|
||
return ""
|
||
|
||
response = client.responses.create(
|
||
model=MODEL,
|
||
input=prompt
|
||
)
|
||
|
||
text = response.output_text.strip() if response.output_text else ""
|
||
|
||
if text and "\n" not in text and "sorry" not in text.lower() and "you" not in text.lower():
|
||
return text
|
||
|
||
return ""
|
||
|
||
except Exception as e:
|
||
return ""
|
||
|
||
|
||
def process_file(input_path, output_path):
|
||
with open(input_path, "r", encoding="utf-8") as infile:
|
||
lines = list(infile)
|
||
|
||
with open(output_path, "w", encoding="utf-8") as outfile:
|
||
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
|
||
futures = [executor.submit(make_request, line) for line in lines]
|
||
|
||
for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
|
||
result = future.result()
|
||
if result:
|
||
# 🔒 ensure thread-safe writes
|
||
with write_lock:
|
||
outfile.write(result + ",NSPECIFIC\n")
|
||
|
||
|
||
if __name__ == "__main__":
|
||
process_file(INPUT_FILE, OUTPUT_FILE) |