first commit
commit 356f72fedc
24 changed files with 367 additions and 0 deletions
backend/inference.py (new file, 54 lines added)
@@ -0,0 +1,54 @@
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.chroma import Chroma
from openai import OpenAI
from backend.vector_db_manager import VectorDbManager
from typing import Optional, Iterator, Dict
from pathlib import Path

# Point to the local server; I personally use LM Studio to run local LLMs.
# You can change this to any other OpenAI-compatible API endpoint, local or not.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")


class InferenceInstance:

    def __init__(self, vector_db_manager: VectorDbManager, nb_chunks_retrieved: int = 4):
        self.vector_db_manager = vector_db_manager
        self.history = []
        self.nb_chunks_retrieved = nb_chunks_retrieved

    def get_next_token(self, input_user: str, doc_name: str) -> Iterator[str]:
        """Stream the assistant's answer, yielding the full text generated so far."""
        new_assistant_message = {"role": "assistant", "content": ""}
        search_results = self._get_search_results(input_user, doc_name)
        print(f"search results: {search_results}")
        pages = self._update_history(input_user, search_results)
        pages_info = "pages used: p" + " p".join(pages)
        print(f"history: {self.history}")
        completion = self._get_completion()

        for chunk in completion:
            delta = chunk.choices[0].delta.content
            if delta:  # the final streamed chunk may carry no content
                new_assistant_message["content"] += delta
            yield pages_info + " " + new_assistant_message["content"]

    def _get_search_results(self, input_user: str, doc_name: str):
        print(f"input_user: {input_user}")
        vector_db = self.vector_db_manager.get_chroma(doc_name)
        return vector_db.similarity_search(input_user, k=self.nb_chunks_retrieved)

    def _update_history(self, input_user: str, search_results):
        some_context = ""
        pages = []
        for result in search_results:
            pages.append(str(result.metadata["page"]))
            some_context += result.page_content + "\n\n"
        self.history.append({"role": "system", "content": f"relevant content for the user question: {some_context}"})
        self.history.append({"role": "user", "content": input_user})
        return pages

    def _get_completion(self):
        return client.chat.completions.create(
            model="local-model",
            messages=self.history,
            temperature=0.7,
            stream=True,
        )
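Not part of this commit: a minimal driver sketch showing how InferenceInstance could be wired up end to end, assuming the VectorDbManager from backend/vector_db_manager.py below, a locally available sentence-transformers embedding model, and placeholder paths. get_next_token yields the whole answer generated so far on each streamed chunk, prefixed with the retrieved page numbers, so a caller can simply re-render the latest value.

from pathlib import Path
from langchain_community.embeddings import HuggingFaceEmbeddings
from backend.vector_db_manager import VectorDbManager
from backend.inference import InferenceInstance

# Assumptions for illustration only: the embedding model name, chunk size,
# and file paths are placeholders, not values from this commit.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
manager = VectorDbManager(
    embedding_function=embeddings,
    embedding_name="all-MiniLM-L6-v2",
    chunk_size=512,
    db_directory=Path("db"),
)
manager.create_vector_store_from_pdf("docs/paper.pdf")  # hypothetical PDF

instance = InferenceInstance(manager, nb_chunks_retrieved=4)
for partial_answer in instance.get_next_token("What is this document about?", "paper.pdf"):
    print(partial_answer)  # each iteration prints the answer-so-far, prefixed with "pages used: ..."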
backend/vector_db_manager.py (new file, 88 lines added)
@@ -0,0 +1,88 @@
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path
import os


class VectorDbManager:

    def __init__(self, embedding_function: Embeddings, embedding_name: str, chunk_size: int, db_directory: Path):
        self.embedding_function = embedding_function
        self.embedding_name = embedding_name
        self.db_directory = db_directory
        self.chunk_size = chunk_size

    def create_vector_store_from_pdf(self, pdf_path):
        """
        Create a Chroma vector store from a PDF file path and persist it in
        db_directory/embedding_name/pdf_name, where pdf_name is the name of the PDF file.

        :param pdf_path: path to the PDF file
        :return: 0 if a persisted store already exists, None otherwise
        """
        pdf_path = Path(pdf_path)
        pdf_name = pdf_path.name
        vector_directory = self.db_directory / self.embedding_name / pdf_name

        if os.path.isdir(vector_directory):
            print(f"{vector_directory} found, not recreating a vector store")
            return 0

        print(f"creating vector store for {vector_directory}")
        loader = PyPDFLoader(str(pdf_path))
        docs = loader.load_and_split()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=64,
            length_function=len,
            is_separator_regex=False,
        )
        docs = text_splitter.split_documents(docs)

        vectorstore = Chroma.from_documents(docs, self.embedding_function, persist_directory=str(vector_directory))
        print("vector store created")
        print(vectorstore)

    def create_vector_store_from_latex(self, latex_path: Path):
        """
        Create a Chroma vector store from a LaTeX file path and persist it in
        db_directory/embedding_name/doc_name, where doc_name is the name of the LaTeX file.

        :param latex_path: path to the LaTeX file
        :return: 0 if a persisted store already exists, None otherwise
        """
        doc_name = latex_path.name
        vector_directory = self.db_directory / self.embedding_name / doc_name

        if os.path.isdir(vector_directory):
            print(f"{vector_directory} found, not recreating a vector store")
            return 0

        print(f"creating vector store for {vector_directory}")

        with open(latex_path, mode="r") as file:
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=100)
            docs = text_splitter.create_documents([file.read()])

        vectorstore = Chroma.from_documents(docs, self.embedding_function, persist_directory=str(vector_directory))

    def get_chroma(self, doc_name):
        """
        Get the Chroma vector store persisted for a given document name.

        :param doc_name: name of the document the store was built from
        :return: the Chroma vector store loaded from db_directory/embedding_name/doc_name
        """
        vector_directory = self.db_directory / self.embedding_name / doc_name
        return Chroma(persist_directory=str(vector_directory), embedding_function=self.embedding_function)
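Also not in this commit: a short sketch of reading a persisted store back through get_chroma, reusing the manager from the sketch above ("paper.pdf" remains a placeholder document name). PyPDFLoader records each chunk's source page number under the "page" metadata key, which is what InferenceInstance relies on to build its "pages used" prefix.

# Reuses `manager` from the previous sketch; the query string is illustrative.
store = manager.get_chroma("paper.pdf")
results = store.similarity_search("what problem does this document address?", k=2)
for doc in results:
    # each retrieved chunk carries the originating PDF page in its metadata
    print(doc.metadata["page"], doc.page_content[:80])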