Files

92 lines
2.6 KiB
TypeScript

import axios from "axios";
import { checkDisinfo } from "./checkDisinfo";
import { writeToJSONL } from "../utils/writeToJSONL";
import { backOff } from "exponential-backoff";
import { logger } from "../utils/logger";
export async function queryScraper(query: string): Promise<string[]> {
try {
const response = await backOff(async () => {
return await queryScraperWorker(query);
}, {
numOfAttempts: 10,
startingDelay: 500,
timeMultiple: 2,
jitter: "full",
maxDelay: 50000,
})
return response;
}
catch (err: any) {
logger.error("Failed out of retry loop, returning placeholder to pipeline")
return ["API EXCEPTION"]
}
}
async function queryScraperWorker(query: string): Promise<string[]> {
const instance = process.env.SCRAPER_INSTANCE;
if (!instance) {
throw new Error("SCRAPER_INSTANCE environment variable is not set");
}
const cleanQuery = query.replace(/[^A-Za-z0-9 ]+/g, "");
const url = `${instance}/api/v1/web`;
const params: Record<string, string> = Object.entries(process.env)
.filter(([key, value]) => key.startsWith("SCRAPER_PARAM_") && value !== undefined)
.reduce((acc: Record<string, string>, [key, value]) => {
const paramName = key.replace(/^SCRAPER_PARAM_/, "").toLowerCase();
acc[paramName] = value!;
return acc;
}, {});
params.s = cleanQuery;
let response;
try {
response = await axios.get(url, { params });
} catch (err: any) {
if (err.response) {
const desc = `HTTP error ${err.response.status}: ${JSON.stringify(err.response.data)}`
logger.error(desc)
throw new Error(desc);
}
throw err;
}
const data = response.data;
if (data?.status !== "ok") {
const desc = `API returned status: ${data?.status}`;
logger.error(desc)
throw new Error(desc);
}
// TEMP?: Convert API results to array of formatted strings.
const context = data.web ?? [];
const lines: string[] = context.map((item: any) => {
if (checkDisinfo(item.url)) {
writeToJSONL("blocked.jsonl", { url: item.url, query: query })
return "";
}
const title = (item.title ?? "").trim();
const desc = (item.description ?? "").trim();
const link = (item.url ?? "").trim();
return `^^^ ${title}\n ${desc}\n ${link}`;
});
return lines;
}
// import dotenv from "dotenv";
// dotenv.config();
// console.log(await queryScraper("sir kier starmer"))