Skip to content

Commit

Permalink
changed to open-source models
Browse files Browse the repository at this point in the history
  • Loading branch information
Siddhesh-Agarwal committed Jan 30, 2025
1 parent 9dae092 commit 9e415b7
Show file tree
Hide file tree
Showing 12 changed files with 2,152 additions and 3,191 deletions.
1 change: 1 addition & 0 deletions backend/.python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.10
3 changes: 1 addition & 2 deletions backend/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from core.db import add_to_db, db_is_working
from core.fact import fact_check_process
from core.preprocessors import get_image, summarize, to_english
from core.preprocessors import summarize, to_english

__all__ = [
"add_to_db",
"db_is_working",
"fact_check_process",
"get_image",
"summarize",
"to_english",
]
54 changes: 28 additions & 26 deletions backend/core/fact.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
import os
from typing import Literal, TypedDict
import asyncio

from bs4 import BeautifulSoup
from groq import AsyncGroq
import instructor
import openai
import requests
import ujson
import instructor
from pydantic import BaseModel
from pymongo import AsyncMongoClient
from pymongo.typings import _DocumentType

from core.db import fetch_from_db_if_exists
from core.preprocessors import summarize
from core.db import fetch_from_db_if_exists # type: ignore
from core.postprocessors import archive_url, is_safe
from schemas import FactCheckLabel, FactCheckResponse, GPTFactCheckModel, TextInputData
from schemas.schemas import FactCheckLabel, FactCheckResponse, GPTFactCheckModel, TextInputData


class SearchResult(TypedDict):
Expand All @@ -25,13 +26,23 @@ class SearchResult(TypedDict):
async def get_content(groq_client: AsyncGroq, url: str) -> str | None:
"""returns the content of given url"""
try:
res = requests.get(url, timeout=15)
res.raise_for_status()
return await summarize(groq_client, res.text)
except Exception:
with requests.get(url, timeout=15) as res:
res.raise_for_status()
soup = BeautifulSoup(res.text, "html.parser")
return await summarize(groq_client, soup.get_text())
except requests.exceptions.RequestException:
return None


async def get_url_content(groq_client: AsyncGroq, item: dict) -> SearchResult:
    """Build a SearchResult for one search-engine result item.

    The page behind the item's ``link`` is fetched and summarized through
    ``get_content``; when that yields nothing, the item's own ``snippet``
    is used as the content instead.

    Args:
        groq_client: Groq client forwarded to ``get_content``.
        item: One result entry. The ``title``, ``link`` and ``snippet`` keys
            are read; any of them may be absent.

    Returns:
        A dict with ``title``, ``link`` and ``content`` keys. Fields missing
        from ``item`` come back as empty strings instead of raising
        ``KeyError``, so one malformed item cannot fail a batch of lookups
        that are awaited together.
    """
    link = str(item.get("link", ""))
    # Without a link there is nothing to fetch, so skip the request entirely.
    content = await get_content(groq_client, link) if link else None
    return {
        "title": item.get("title", ""),
        "link": link,
        # Fall back to the search engine's snippet when no page content
        # could be retrieved or summarized.
        "content": content or item.get("snippet", ""),
    }


async def search_tool(groq_client: AsyncGroq, query: str, num_results: int = 3):
"""Tool to search via Google CSE"""
api_key = os.getenv("GOOGLE_API_KEY", "")
Expand All @@ -43,23 +54,16 @@ async def search_tool(groq_client: AsyncGroq, query: str, num_results: int = 3):
json = ujson.loads(resp.text)
assert hasattr(resp, "items")
res: list[SearchResult] = []
for item in json["items"]:
content = await get_content(groq_client, item["link"])
res.append(
{
"title": item["title"],
"link": item["link"],
"content": content or item["snippet"],
}
)
tasks = [get_url_content(groq_client, item) for item in json["items"]]
res.extend(await asyncio.gather(*tasks))
return res


# Structured-output schema: used as the `response_model` of the first chat
# completion in fact_check, so the model's reply is parsed into a single
# search-engine query string.
# NOTE(review): documented with comments rather than a class docstring on
# purpose - pydantic presumably copies a model docstring into the generated
# schema description, which would change what is sent to the LLM; confirm
# before converting this to a docstring.
class SearchQuery(BaseModel):
query: str


async def fact_check(oai_client: openai.AsyncOpenAI, groq_client: AsyncGroq, data: TextInputData) -> GPTFactCheckModel:
async def fact_check(groq_client: AsyncGroq, data: TextInputData) -> GPTFactCheckModel:
"""
fact_check checks the data against the OpenAI API.
Expand All @@ -76,15 +80,15 @@ async def fact_check(oai_client: openai.AsyncOpenAI, groq_client: AsyncGroq, dat

claim = data.content

client = instructor.from_openai(oai_client)
client = instructor.from_groq(groq_client)

response = await client.chat.completions.create(
model="gpt-4o-mini",
model="llama-3.2-3b-preview",
response_model=SearchQuery,
messages=[
{
"role": "system",
"content": "I want you to act as a fact-check researcher. You will be given a claim and you have should search the information on a custom search engine to help in the fact checking. Frame a query using the least words possible and return only the query.",
"content": "You are a fact-check researcher whose task is to search information to help in the fact checking. Frame an appropriate query to get the most appropriate results that will aid in the fact check",
},
{
"role": "user",
Expand All @@ -98,11 +102,11 @@ async def fact_check(oai_client: openai.AsyncOpenAI, groq_client: AsyncGroq, dat

# Send the search results back to GPT for analysis
final_response = await client.chat.completions.create(
model="gpt-4o",
model="deepseek-r1-distill-llama-70b",
messages=[
{
"role": "system",
"content": "I want you to act as a fact checker. You will be given a statement along with relevant search results and you are supposed to provide a fact check based on them. You need to classify the claim as correct, incorrect, or misleading and provide the logical explanation along with the sources you used.",
"content": "I want you to act as a fact checker. You will be given a statement along with relevant search results and you are supposed to provide a fact check based on the search results. You need to classify the claim as 'correct', 'incorrect', or 'misleading' and provide the logical explanation along with the sources you used.",
},
{
"role": "user",
Expand All @@ -116,7 +120,6 @@ async def fact_check(oai_client: openai.AsyncOpenAI, groq_client: AsyncGroq, dat


async def fact_check_process(
oai_client: openai.AsyncOpenAI,
groq_client: AsyncGroq,
text_data: TextInputData,
mongo_client: AsyncMongoClient[_DocumentType],
Expand All @@ -143,12 +146,11 @@ async def fact_check_process(
if fact_check_ is not None:
return (fact_check_, True)

fact_check_resp = await fact_check(oai_client, groq_client, text_data)
fact_check_resp = await fact_check(groq_client, text_data)

# assign to right variable
fact_check_obj = FactCheckResponse(
url=text_data.url,
dataType=dtype,
label=fact_check_resp.label,
response=fact_check_resp.explanation,
summary=text_data.content,
Expand Down
18 changes: 10 additions & 8 deletions backend/core/postprocessors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import pandas as pd
import requests

from pydantic import AnyHttpUrl

import requests
from waybackpy import WaybackMachineSaveAPI
from waybackpy.exceptions import MaximumSaveRetriesExceeded

Expand Down Expand Up @@ -31,13 +33,13 @@ def is_safe(url: AnyHttpUrl) -> bool:
def archive_url(url: AnyHttpUrl) -> str | None:
"""returns the archive url of given url"""

user_agent = "Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405"
save_api = WaybackMachineSaveAPI(
url=str(url),
user_agent=user_agent,
max_tries=3,
)
try:
user_agent = "Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405"
save_api = WaybackMachineSaveAPI(
url=str(url),
user_agent=user_agent,
max_tries=3,
)
return save_api.save()
except (MaximumSaveRetriesExceeded, requests.exceptions.RetryError):
return None
return str(url)
12 changes: 0 additions & 12 deletions backend/core/preprocessors.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
from io import BytesIO

import instructor
import requests
from deep_translator.google import GoogleTranslator # type: ignore
from groq import AsyncGroq
from PIL import Image
from PIL.ImageFile import ImageFile
from pydantic import BaseModel, Field


Expand Down Expand Up @@ -48,10 +43,3 @@ async def summarize(client: AsyncGroq, text: str) -> str:
return response.summary
except AssertionError:
return text


def get_image(image_url: str) -> ImageFile:
    """Fetch an image from a URL and return it as a PIL ImageFile.

    Args:
        image_url: Address of the image to download. Redirects are followed
            and the request times out after 15 seconds.

    Returns:
        The image opened by ``Image.open`` from the downloaded bytes.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status (raised by ``raise_for_status``).
    """
    # Use the response as a context manager so the underlying connection is
    # released even when raise_for_status() raises.
    with requests.get(image_url, allow_redirects=True, timeout=15) as response:
        response.raise_for_status()
        # Read the whole body while the response is still open; the image is
        # then decoded from this in-memory copy, independent of the socket.
        payload = response.content
    return Image.open(BytesIO(payload))
42 changes: 7 additions & 35 deletions backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,14 @@
from contextlib import asynccontextmanager

import logfire
import pytesseract # type: ignore
import requests
from dotenv import load_dotenv
from fastapi import BackgroundTasks, FastAPI
from fastapi.middleware.cors import CORSMiddleware
from groq import AsyncGroq
from openai import AsyncOpenAI
from pymongo import AsyncMongoClient

from core import add_to_db, db_is_working, fact_check_process, get_image, summarize, to_english
from schemas import FactCheckResponse, HealthResponse, ImageInputData, TextInputData
from core import add_to_db, db_is_working, fact_check_process, summarize, to_english
from schemas import FactCheckResponse, HealthResponse, TextInputData

# Load environment variables
load_dotenv()
Expand All @@ -22,22 +19,21 @@
# Global variables
ENV = os.environ.get("ENV", "dev")
DEBUG = ENV == "dev"
URI = os.environ.get("MONGO_URI", "mongodb://localhost:27017")
URI = "mongodb://localhost:27017"


oai_client = AsyncOpenAI()
groq_client = AsyncGroq()
mongo_client = AsyncMongoClient(URI) # type: ignore


@asynccontextmanager
async def lifespan(app: FastAPI):
"""Manages the lifespan of the FastAPI app."""
print("Lifespan starting...")
print(f"Lifespan starting for {app.title}...")
await mongo_client.aconnect()
print("Lifespan started")
yield
print("Lifespan ending...")
print(f"Lifespan ending for {app.title}...")
await mongo_client.aclose()
print("Lifespan ended")

Expand All @@ -54,7 +50,7 @@ async def lifespan(app: FastAPI):
# FastAPI CORS
app.add_middleware(
CORSMiddleware,
allow_methods=["GET", "POST"],
allow_methods=["*"],
allow_origins=["*"],
allow_headers=["*"],
)
Expand All @@ -77,31 +73,7 @@ async def verify_news(data: TextInputData, background_tasks: BackgroundTasks) ->
"""Endpoint to verify a news article."""

data.content = await summarize(groq_client, to_english(data.content))
fact_check, is_present_in_db = await fact_check_process(oai_client, groq_client, data, mongo_client, "text") # type: ignore
if not is_present_in_db:
background_tasks.add_task(add_to_db, mongo_client, fact_check) # type: ignore
return fact_check


@app.post("/verify/image/")
async def image_check(data: ImageInputData, background_tasks: BackgroundTasks) -> FactCheckResponse:
"""Endpoint to check if an image is fake."""

pytesseract.pytesseract.tesseract_cmd = os.environ.get("TESSERACT_PATH")

pic_url_str = str(data.url)
response = requests.get(pic_url_str, allow_redirects=True, timeout=15)
response.raise_for_status()
image = get_image(pic_url_str)
res = pytesseract.image_to_string(image) # type: ignore
assert isinstance(res, (str, bytes))
text = res if isinstance(res, str) else res.decode("utf-8")
text_data = TextInputData(
url=data.url,
content=text,
)

fact_check, is_present_in_db = await fact_check_process(oai_client, groq_client, text_data, mongo_client, "image") # type: ignore
fact_check, is_present_in_db = await fact_check_process(groq_client, data, mongo_client, "text") # type: ignore
if not is_present_in_db:
background_tasks.add_task(add_to_db, mongo_client, fact_check) # type: ignore
return fact_check
Loading

0 comments on commit 9e415b7

Please sign in to comment.