Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*.pyc
__pycache__
vectore_stores
sajith_vectorstore
32 changes: 32 additions & 0 deletions RAG/agents/answer_grader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_mistralai import ChatMistralAI
from dotenv import load_dotenv
import os

# SECURITY: never commit API keys to source control. The key is now read from
# the environment (a .env file is supported via python-dotenv), matching the
# pattern used by contextualize.py and extractor.py. The previously committed
# key must be revoked/rotated — it is exposed in the repository history.
load_dotenv()
mistral_api_key = os.getenv("MISTRAL_API_KEY")
if not mistral_api_key:
    raise ValueError("MISTRAL_API_KEY environment variable not set")


# Data model
class GradeAnswer(BaseModel):
    """Binary score to assess answer addresses question."""

    # 'yes' when the generation resolves the user's question, otherwise 'no'
    binary_score: str = Field(
        description="Answer addresses the question, 'yes' or 'no'"
    )


# LLM constrained to emit a GradeAnswer via structured (function-call) output
llm = ChatMistralAI(model="mistral-large-latest", api_key=mistral_api_key)
structured_llm_grader = llm.with_structured_output(GradeAnswer)

# Prompt
system = """You are a grader assessing whether an answer addresses / resolves a question \n
Give a binary score 'yes' or 'no'. Yes' means that the answer resolves the question."""
answer_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "User question: \n\n {question} \n\n LLM generation: {generation}"),
    ]
)

# Runnable: {question, generation} -> GradeAnswer
answer_grader = answer_prompt | structured_llm_grader
43 changes: 43 additions & 0 deletions RAG/agents/contextualize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel, Field
from langchain_mistralai import ChatMistralAI
from langchain_core.output_parsers import StrOutputParser
from dotenv import load_dotenv
import os


class ContextualizeQuestion(BaseModel):
    """Contextualize the question."""

    # The latest user turn rewritten to stand alone without the chat history
    contextualized_question: str = Field(
        ...,
        description="The contextualized question.",
    )


# System prompt: rewrite the latest user turn into a self-contained question
# (resolve pronouns/references, pull in needed context) without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question, "
    "which might reference context in the chat history, "
    "formulate a standalone question that can be understood "
    "without the chat history. Specifically:"
    "\n1. Replace pronouns (e.g., 'he', 'she', 'it', 'they') with their specific referents."
    "\n2. Expand references like 'that', 'this', 'those' to what they specifically refer to."
    "\n3. Include any relevant context from previous messages that's necessary to understand the question."
    "\n4. Ensure the reformulated question is clear, specific, and self-contained."
    "\nDo NOT answer the question, just reformulate it to be self-explanatory."
)

load_dotenv()
mistral_api_key = os.getenv("MISTRAL_API_KEY")
if not mistral_api_key:
    raise ValueError("MISTRAL_API_KEY environment variable not set")
# Use the already-validated key instead of re-reading the environment,
# so the validation above cannot be bypassed by a second lookup.
llm = ChatMistralAI(model="mistral-large-latest", api_key=mistral_api_key)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
structured_llm_router = llm.with_structured_output(ContextualizeQuestion)

# Runnable: {chat_history, input} -> ContextualizeQuestion
contextualizer = contextualize_q_prompt | structured_llm_router
66 changes: 66 additions & 0 deletions RAG/agents/extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
### Router
#
# Builds `question_extractor`, a runnable that takes a user question and emits
# an ExtractQuery containing one sub-query per datasource: three candidate
# manifesto vectorstores (Namal, Ranil, Sajith) plus a web search.

from typing import Literal

from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel , Field
from langchain_mistralai import ChatMistralAI
from dotenv import load_dotenv
import os
# Data model
class ExtractQuery(BaseModel):
    """Route a user query to the relevant datasources with subquestions."""

    # Sub-query aimed at Namal Rajapaksa's manifesto vectorstore
    namal_vector_search_query: str = Field(
        ...,
        description="The query to search the vector store of namal.",
    )
    # Sub-query aimed at Ranil Wickramasinghe's manifesto vectorstore
    ranil_vector_search_query: str = Field(
        ...,
        description="The query to search the vector store of ranil.",
    )
    # Sub-query aimed at Sajith Premadasa's manifesto vectorstore
    sajith_vector_search_query: str = Field(
        ...,
        description="The query to search the vector store of sajith.",
    )
    # Sub-query aimed at a general web search
    web_search_query: str = Field(
        ...,
        description="The query to search the web.",
    )

load_dotenv()
mistral_api_key = os.getenv("MISTRAL_API_KEY")

# Fail fast at import time if the key is missing, rather than at first call
if not mistral_api_key:
    raise ValueError("MISTRAL_API_KEY environment variable not set")

# Initialize the ChatMistralAI client with the API key
llm = ChatMistralAI(model="mistral-large-latest", api_key=mistral_api_key)
structured_llm_router = llm.with_structured_output(ExtractQuery)

# Prompt
system = """You are an expert at routing a user question to a vectorstore or web search.
There are three vectorstores. One contains documents related to Manifests of political candidate Sajith Premadasa.
Another contains documents related to Manifests of political candidate Namal Rajapaksa.
The third contains documents related to Manifests of political candidate Ranil Wickramasinghe.

for an example, what this candidate do for education sector, health sector etc is on the vectorstore.
And also their plans for the future of the country is on the vectorstore.

If the question involves something about a candidate's policies in a past year, then you will have to do a websearch.
And also if you feel like a web search will be usefull. Do a web search.

After deciding,
Output the 'namal_vector_search_query': The query that needs to be searched from the vector store of namal.
And the 'ranil_vector_search_query': The query that needs to be searched from the vector store of ranil.
And the 'sajith_vector_search_query': The query that needs to be searched from the vector store of sajith.
And the 'web_search_query': The query that needs to be searched from the web.
"""
route_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)

# Runnable: {question} -> ExtractQuery (one sub-query per datasource)
question_extractor = route_prompt | structured_llm_router
43 changes: 43 additions & 0 deletions RAG/agents/generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Builds the answer-generation prompt and LLM for the RAG pipeline. The prompt
# interleaves web results and per-candidate manifesto context with the question.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel , Field
from langchain_mistralai import ChatMistralAI
import os
template = """You are a very vigilant and helpful journalist. Use the following pieces of
context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Be concise and helpful.

________________________________________________________________________________
Here are the web results regarding the question:
{web_context}

Here are the results from the manifesto of the candidates:
________________________________________________________________________________
namel rajapakse:
{namal_context}

________________________________________________________________________________
ranil wickramasinghe:
{ranil_context}

________________________________________________________________________________
sajith premadasa:
{sajith_context}

________________________________________________________________________________
Question: {question}

Helpful Answer:"""
custom_rag_prompt = PromptTemplate.from_template(template)

# LLM
# NOTE(review): api_key is None when MISTRAL_API_KEY is unset — other modules
# in this package validate the key at import time; consider doing so here too.
llm = ChatMistralAI(model="mistral-large-latest", api_key=os.getenv("MISTRAL_API_KEY"))

# Post-processing
def format_docs(docs):
    """Concatenate the page_content of each document, blank-line separated."""
    texts = [d.page_content for d in docs]
    return "\n\n".join(texts)


# Full RAG chain: fill the prompt with context -> Mistral LLM -> plain string
rag_chain = custom_rag_prompt | llm | StrOutputParser()
29 changes: 29 additions & 0 deletions RAG/agents/grader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Builds `retrieval_grader`, a lenient yes/no relevance grader used to filter
# retrieved documents before answer generation.
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel , Field
from langchain_mistralai import ChatMistralAI
import os

class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    # 'yes' when the document is relevant to the question, otherwise 'no'
    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )


# NOTE(review): api_key is None when MISTRAL_API_KEY is unset — other modules
# in this package validate the key at import time; consider doing so here too.
llm = ChatMistralAI(model="mistral-large-latest", api_key=os.getenv("MISTRAL_API_KEY"))
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# Prompt
system = """You are a grader assessing relevance of a retrieved document to a user question. \n
If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

# Runnable: {document, question} -> GradeDocuments
retrieval_grader = grade_prompt | structured_llm_grader
28 changes: 28 additions & 0 deletions RAG/agents/hallucinate_checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Builds `hallucination_grader`, a yes/no grader that checks whether a
# generation is grounded in the retrieved facts.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_mistralai import ChatMistralAI
import os  # moved to the top-of-file import block (PEP 8)


class GradeHallucinations(BaseModel):
    """Binary score for hallucination present in generation answer."""

    # 'yes' when the generation is supported by the supplied facts, else 'no'
    binary_score: str = Field(
        description="Answer is grounded in the facts, 'yes' or 'no'"
    )


# LLM with function call
# NOTE(review): api_key is None when MISTRAL_API_KEY is unset — other modules
# in this package validate the key at import time; consider doing so here too.
llm = ChatMistralAI(model="mistral-large-latest", api_key=os.getenv("MISTRAL_API_KEY"))
structured_llm_grader = llm.with_structured_output(GradeHallucinations)

# Prompt
system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n
Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts."""
hallucination_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Set of facts: \n\n {documents} \n\n LLM generation: {generation}"),
    ]
)

# Runnable: {documents, generation} -> GradeHallucinations
hallucination_grader = hallucination_prompt | structured_llm_grader
22 changes: 22 additions & 0 deletions RAG/agents/question_rewriter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Builds `question_rewriter`, which rewrites a question to be better suited
# for vectorstore retrieval (used after a failed retrieval round).
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel , Field
from langchain_mistralai import ChatMistralAI
import os
# NOTE(review): api_key is None when MISTRAL_API_KEY is unset — other modules
# in this package validate the key at import time; consider doing so here too.
llm = ChatMistralAI(model="mistral-large-latest", api_key=os.getenv("MISTRAL_API_KEY"))

# Prompt
system = """You a question re-writer that converts an input question to a better version that is optimized \n
for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
re_write_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        (
            "human",
            "Here is the initial question: \n\n {question} \n Formulate an improved question.",
        ),
    ]
)

# Runnable: {question} -> improved question as a plain string
question_rewriter = re_write_prompt | llm | StrOutputParser()
26 changes: 26 additions & 0 deletions RAG/edges/decide_to_generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
def decide_to_generate(state):
    """
    Determines whether to generate an answer, or re-generate a question.

    Args:
        state (dict): The current graph state. Reads state["documents"],
            the list of documents that survived relevance grading.

    Returns:
        str: Binary decision for next node to call — "transform_query" when
        no relevant documents remain, otherwise "generate".
    """

    print("---ASSESS GRADED DOCUMENTS---")
    # (The original also evaluated state["question"] as a bare expression —
    # a no-op leftover, removed.)
    filtered_documents = state["documents"]

    if not filtered_documents:
        # All documents were filtered out by the relevance check:
        # re-phrase the question and try retrieval again.
        print(
            "---DECISION: ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION, TRANSFORM QUERY---"
        )
        return "transform_query"
    else:
        # At least one relevant document remains: proceed to generation.
        print("---DECISION: GENERATE---")
        return "generate"
44 changes: 44 additions & 0 deletions RAG/edges/generation_grader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from RAG.agents.answer_grader import answer_grader
from RAG.agents.hallucinate_checker import hallucination_grader

def grade_generation_v_documents_and_question(state):
    """
    Determines whether the generation is grounded in the document and answers question.

    Args:
        state (dict): The current graph state. Reads "question", "generation",
            the four per-source document lists, and optionally
            "generated_count" (number of generation attempts so far).

    Returns:
        str: Decision for next node to call — "useful", "not useful", or
        "not supported".
    """

    print("---CHECK HALLUCINATIONS---")

    # Give up re-trying after more than one failed regeneration. This is
    # checked FIRST so we do not spend a grader LLM call whose verdict would
    # be discarded anyway (previously the hallucination grader was invoked
    # and its score ignored on this path).
    if state.get("generated_count", 0) > 1:
        print("---DECISION: ALREADY TRIED AND FAILED. CONTINUING ANYWAYS---")
        return "useful"

    question = state["question"]
    # Pool every retrieved document, regardless of source, as the fact set.
    documents = (
        state["namal_vector_search_documents"]
        + state["ranil_vector_search_documents"]
        + state["sajith_vector_search_documents"]
        + state["web_search_documents"]
    )
    generation = state["generation"]

    score = hallucination_grader.invoke(
        {"documents": documents, "generation": generation}
    )
    grade = score.binary_score

    # Check hallucination
    if grade == "yes":
        print("---DECISION: GENERATION IS GROUNDED IN DOCUMENTS---")
        # Check question-answering
        print("---GRADE GENERATION vs QUESTION---")
        score = answer_grader.invoke({"question": question, "generation": generation})
        grade = score.binary_score
        if grade == "yes":
            print("---DECISION: GENERATION ADDRESSES QUESTION---")
            return "useful"
        else:
            print("---DECISION: GENERATION DOES NOT ADDRESS QUESTION---")
            return "not useful"
    else:
        print("---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY---")
        return "not supported"
Loading