first commit

Crizomb 2024-04-17 00:21:46 +02:00
commit 356f72fedc
24 changed files with 367 additions and 0 deletions

3
.idea/.gitignore generated vendored Normal file

@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

10
.idea/ai_pdf.iml generated Normal file

@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

21
.idea/inspectionProfiles/Project_Default.xml Normal file

@@ -0,0 +1,21 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredPackages">
        <value>
          <list size="1">
            <item index="0" class="java.lang.String" itemvalue="matplotlib" />
          </list>
        </value>
      </option>
    </inspection_tool>
    <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="W605" />
        </list>
      </option>
    </inspection_tool>
  </profile>
</component>

6
.idea/inspectionProfiles/profiles_settings.xml Normal file

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>

4
.idea/misc.xml generated Normal file

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (ai_pdf)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml generated Normal file

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/ai_pdf.iml" filepath="$PROJECT_DIR$/.idea/ai_pdf.iml" />
    </modules>
  </component>
</project>

6
.idea/vcs.xml generated Normal file

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>

54
backend/inference.py Normal file

@@ -0,0 +1,54 @@
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.chroma import Chroma
from openai import OpenAI
from backend.vector_db_manager import VectorDbManager
from typing import Optional, Iterator, Dict
from pathlib import Path

# Point to the local server; I personally use LM Studio to run local LLMs.
# You can change this to any other OpenAI API endpoint, local or not.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")


class InferenceInstance:
    def __init__(self, vector_db_manager: VectorDbManager, nb_chunks_retrieved: int = 4):
        self.vector_db_manager = vector_db_manager
        self.history = []
        self.nb_chunks_retrieved = nb_chunks_retrieved

    def get_next_token(self, input_user: str, doc_name: str) -> Iterator[str]:
        new_assistant_message = {"role": "assistant", "content": ""}
        search_results = self._get_search_results(input_user, doc_name)
        print(f"search results: {search_results}")
        pages = self._update_history(input_user, search_results)
        pages_info = "pages used : p" + " p".join(pages)
        print(f"history: {self.history}")
        completion = self._get_completion()
        for chunk in completion:
            # delta.content can be None on the final streamed chunk
            new_assistant_message["content"] += chunk.choices[0].delta.content or ""
            yield pages_info + " " + new_assistant_message["content"]

    def _get_search_results(self, input_user: str, doc_name: str):
        print(f"input_user: {input_user}")
        vector_db = self.vector_db_manager.get_chroma(doc_name)
        return vector_db.similarity_search(input_user, k=self.nb_chunks_retrieved)

    def _update_history(self, input_user: str, search_results):
        some_context = ""
        pages = []
        for result in search_results:
            pages.append(str(result.metadata['page']))
            some_context += result.page_content + "\n\n"
        self.history.append({"role": "system", "content": f"relevant content for user question {some_context}"})
        self.history.append({"role": "user", "content": input_user})
        return pages

    def _get_completion(self):
        return client.chat.completions.create(
            model="local-model",
            messages=self.history,
            temperature=0.7,
            stream=True,
        )

88
backend/vector_db_manager.py Normal file

@@ -0,0 +1,88 @@
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path
import os


class VectorDbManager:
    def __init__(self, embedding_function: Embeddings, embedding_name: str, chunk_size: int, db_directory: Path):
        self.embedding_function = embedding_function
        self.embedding_name = embedding_name
        self.db_directory = db_directory
        self.chunk_size = chunk_size

    def create_vector_store_from_pdf(self, pdf_path):
        """
        Create a Chroma vector store from a PDF file path.
        The vector store is persisted in db_directory/embedding_name/pdf_name,
        where pdf_name is the name of the PDF file.
        :param pdf_path: path to the PDF file
        :return:
        """
        pdf_path = Path(pdf_path)
        pdf_name = pdf_path.name
        vector_directory = self.db_directory / self.embedding_name / pdf_name
        if os.path.isdir(vector_directory):
            print(f"{vector_directory} found, not recreating a vector store")
            return 0

        print(f"creating vector store for {vector_directory}")
        file = PyPDFLoader(str(pdf_path))
        docs = []
        pages = file.load_and_split()
        for j, page in enumerate(pages):
            docs.append(page)

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=64,
            length_function=len,
            is_separator_regex=False,
        )
        docs = text_splitter.split_documents(docs)
        vectorstore = Chroma.from_documents(docs, self.embedding_function, persist_directory=str(vector_directory))
        print("vector store created")
        print(vectorstore)

    def create_vector_store_from_latex(self, latex_path: Path):
        """
        Create a Chroma vector store from a LaTeX file path.
        The vector store is persisted in db_directory/embedding_name/doc_name,
        where doc_name is the name of the LaTeX file.
        :param latex_path: path to the LaTeX source file
        :return:
        """
        doc_name = latex_path.name
        vector_directory = self.db_directory / self.embedding_name / doc_name
        if os.path.isdir(vector_directory):
            print(f"{vector_directory} found, not recreating a vector store")
            return 0

        print(f"creating vector store for {vector_directory}")
        with open(latex_path, mode="r") as file:
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=100)
            # create_documents wraps the raw text in Document objects before splitting
            docs = text_splitter.create_documents([file.read()])
            vectorstore = Chroma.from_documents(docs, self.embedding_function, persist_directory=str(vector_directory))

    def get_chroma(self, doc_name):
        """
        Get the Chroma vector store for a given document name.
        :param doc_name: name of the document whose vector store should be loaded
        :return:
        """
        vector_directory = self.db_directory / self.embedding_name / doc_name
        return Chroma(persist_directory=str(vector_directory), embedding_function=self.embedding_function)

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
front_end/corr_exam.pdf Normal file

Binary file not shown.


@@ -0,0 +1,9 @@
PDF File,output,flag,username,timestamp
flagged\PDF File\bc51931986aa4ff95e72\integration.pdf,"'
<iframe
src=""http://localhost:8000/pdfs/integration.pdf""
:width=""1000px""
:height=""2000px""
frameborder=""0""
></iframe>
",,,2024-04-15 02:34:48.167890

134
front_end/main.py Normal file

@@ -0,0 +1,134 @@
import gradio as gr
import os
import subprocess
from pathlib import Path
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from backend.vector_db_manager import VectorDbManager
from backend.inference import InferenceInstance
import time


def get_accessible_port():
    from socket import socket
    with socket() as s:
        s.bind(('', 0))
        return int(s.getsockname()[1])


port = get_accessible_port()


# Launch a simple HTTP server to serve the PDF files
def start_server():
    command = ['python', '-m', 'http.server', f"{port}"]
    # Set the working directory to the documents folder to serve the PDF files
    os.chdir(Path(os.getcwd()).parent / "documents")
    subprocess.Popen(command)
    # Return to the original working directory
    os.chdir(Path(os.getcwd()).parent / "front_end")


# Start the server
start_server()

# Create VectorDbManager and Inference instance
embedding_func = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large", model_kwargs={'device': 'cuda'})
base_db_directory = Path(r"../documents/vector_db")
vector_db_manager = VectorDbManager(embedding_name="multilingual-e5-large", embedding_function=embedding_func, chunk_size=512, db_directory=base_db_directory)
inference_instance = InferenceInstance(vector_db_manager=vector_db_manager, nb_chunks_retrieved=4)

user_message_global = ""
doc_path = ""  # set by update_path when the user uploads a PDF


def user(user_message, history):
    global user_message_global
    user_message_global = user_message
    return "", history + [[user_message, None]]


def bot(history):
    global user_message_global, doc_path
    if doc_path != "":
        print("FOUND DOC_PATH")
        vector_db_manager.create_vector_store_from_pdf(doc_path)
    else:
        print("NOT FOUND DOC_PATH")

    # Pass only the file name (platform-independent) to look up the vector store
    bot_message = inference_instance.get_next_token(user_message_global, Path(doc_path).name)
    history[-1][1] = ""
    for message in bot_message:
        history[-1][1] = message
        time.sleep(0.05)
        yield history


def update_path(p):
    """Update the global variable doc_path with the selected PDF path"""
    global doc_path
    doc_path = str(p)
    print(f"Selected PDF path: {doc_path}")


def pdf_viewer(pdf_file):
    """Display the PDF file in an HTML viewer"""
    pdf_path = Path(pdf_file)
    pdf_working_dir = Path(os.getcwd()).parent / "documents" / "pdfs"

    # Check if the PDF file is in the working directory
    if not (pdf_working_dir / pdf_path.name).exists():
        return f"""<h1>File {pdf_path.name} not found in the working directory</h1>
        <p>You can only access PDFs that are inside {pdf_working_dir}</p>"""

    # Create the HTML code for the PDF viewer
    return f"""
    <iframe
        src="http://localhost:{port}/pdfs/{pdf_path.name}"
        width="100%"
        height="800px"
        style="border:none;"
    ></iframe>
    """


# Define main Gradio tab
with gr.Blocks() as main_tab:
    with gr.Column():
        with gr.Row():
            with gr.Column(scale=12):
                pdf_output = gr.HTML()
        with gr.Row():
            with gr.Column(scale=12):
                file_input = gr.File(label="Select a PDF file")
    with gr.Column():
        with gr.Group():
            chatbot = gr.Chatbot(scale=2)
            msg = gr.Textbox(scale=2)
            msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
                bot, chatbot, chatbot
            )
    file_input.change(pdf_viewer, inputs=file_input, outputs=pdf_output)
    file_input.upload(update_path, inputs=file_input)

# Define options tab
with gr.Blocks() as options_tab:
    with gr.Column():
        with gr.Row():
            with gr.Column(scale=12):
                # TODO: Add options for the inference instance
                gr.Textbox(label="Options", scale=2)

app = gr.TabbedInterface([main_tab, options_tab], ["Main", "Options"])
app.queue()
app.launch()

21
front_end/test.html Normal file

@@ -0,0 +1,21 @@
<!DOCTYPE html>
<html>
<head>
  <title>PDF Viewer</title>
</head>
<body>
  <!--
    Place the following <div> element where you want the PDF to be displayed in your website.
    You can change the size using the width and height attributes.
  -->
  <div>
    <iframe
      src="C:\Users\CLEME\Pictures\corr_exam.pdf"
      width="500"
      height="678"
    >
    </iframe>
  </div>
</body>
</html>

3
front_end/test.py Normal file

@@ -0,0 +1,3 @@
import os
print("---")
print(os.getcwd())