commit 356f72fedc7f59725edc9bf09b411b71924e313e
Author: Crizomb
Date:   Wed Apr 17 00:21:46 2024 +0200

    first commit

diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/.idea/ai_pdf.iml b/.idea/ai_pdf.iml
new file mode 100644
index 0000000..74d515a
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..0e94d02
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..d6f823e
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..c5757b1
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
(the XML bodies of the .idea project files above did not survive extraction)
diff --git a/backend/inference.py b/backend/inference.py
new file mode 100644
index 0000000..de302e6
--- /dev/null
+++ b/backend/inference.py
@@ -0,0 +1,54 @@
+from openai import OpenAI
+from backend.vector_db_manager import VectorDbManager
+from typing import Iterator
+
+# Point to the local server; I personally use LM Studio to run local LLMs.
+# You can change this to any other OpenAI-compatible API endpoint, local or not.
+client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")
+
+
+class InferenceInstance:
+    def __init__(self, vector_db_manager: VectorDbManager, nb_chunks_retrieved: int = 4):
+        self.vector_db_manager = vector_db_manager
+        self.history = []
+        self.nb_chunks_retrieved = nb_chunks_retrieved
+
+    def get_next_token(self, input_user: str, doc_name: str) -> Iterator[str]:
+        new_assistant_message = {"role": "assistant", "content": ""}
+        search_results = self._get_search_results(input_user, doc_name)
+        print(f"search results: {search_results}")
+        pages = self._update_history(input_user, search_results)
+        pages_info = "pages used : p" + " p".join(pages)
+        print(f"history: {self.history}")
+        completion = self._get_completion()
+
+        for chunk in completion:
+            # The final streamed chunk can carry a None delta, so guard against it
+            new_assistant_message["content"] += chunk.choices[0].delta.content or ""
+            yield pages_info + " " + new_assistant_message["content"]
+
+    def _get_search_results(self, input_user: str, doc_name: str):
+        print(f"input_user: {input_user}")
+        vector_db = self.vector_db_manager.get_chroma(doc_name)
+        return vector_db.similarity_search(input_user, k=self.nb_chunks_retrieved)
+
+    def _update_history(self, input_user: str, search_results):
+        some_context = ""
+        pages = []
+        for result in search_results:
+            pages.append(str(result.metadata['page']))
+            some_context += result.page_content + "\n\n"
+        self.history.append({"role": "system", "content": f"relevant content for user question: {some_context}"})
+        self.history.append({"role": "user", "content": input_user})
+        return pages
+
+    def _get_completion(self):
+        return client.chat.completions.create(
+            model="local-model",
+            messages=self.history,
+            temperature=0.7,
+            stream=True,
+        )
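A minimal sketch of driving this streaming interface on its own, outside the Gradio front end added later in this commit. It assumes an OpenAI-compatible server such as LM Studio is listening on localhost:1234, that it runs from the repository root, and that "my_doc.pdf" is a hypothetical document already placed under documents/pdfs:

    from pathlib import Path
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from backend.vector_db_manager import VectorDbManager
    from backend.inference import InferenceInstance

    embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")
    manager = VectorDbManager(embedding_function=embeddings,
                              embedding_name="multilingual-e5-large",
                              chunk_size=512,
                              db_directory=Path("documents/vector_db"))
    manager.create_vector_store_from_pdf("documents/pdfs/my_doc.pdf")  # hypothetical file

    instance = InferenceInstance(vector_db_manager=manager, nb_chunks_retrieved=4)
    # get_next_token yields the whole answer-so-far on each streamed token,
    # so the last yielded value is the complete answer
    answer = ""
    for partial in instance.get_next_token("What is this document about?", "my_doc.pdf"):
        answer = partial
    print(answer)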
self.history.append({"role": "system", "content": f"relevant content for user question {some_context}"}) + self.history.append({"role": "user", "content": input_user}) + return pages + + def _get_completion(self): + return client.chat.completions.create( + model="local-model", + messages=self.history, + temperature=0.7, + stream=True, + ) + diff --git a/backend/vector_db_manager.py b/backend/vector_db_manager.py new file mode 100644 index 0000000..02f7e24 --- /dev/null +++ b/backend/vector_db_manager.py @@ -0,0 +1,88 @@ +from langchain_core.embeddings import Embeddings +from langchain_community.vectorstores import Chroma +from langchain_community.embeddings import HuggingFaceEmbeddings +from langchain.text_splitter import RecursiveCharacterTextSplitter, Language +from langchain_community.document_loaders import PyPDFLoader +from pathlib import Path +import os + + +class VectorDbManager: + def __init__(self, embedding_function: Embeddings, embedding_name: str, chunk_size: int, db_directory: Path): + self.embedding_function = embedding_function + self.embedding_name = embedding_name + self.db_directory = db_directory + self.chunk_size = chunk_size + + + def create_vector_store_from_pdf(self, pdf_path): + """ + create a chroma vector store from a pdf file path + store the vector store in the db_directory/pdf_name + where pdf_name is the name of the pdf file + + :param pdf_path: + :return: + """ + pdf_path = Path(pdf_path) + pdf_name = pdf_path.name + vector_directory = self.db_directory/self.embedding_name/pdf_name + + if os.path.isdir(vector_directory): + print(f"{vector_directory} found, not recreating a vector store") + return 0 + + print(f"creating vector store for {vector_directory}") + file = PyPDFLoader(pdf_path) + + docs = [] + pages = file.load_and_split() + for j, page in enumerate(pages): + docs.append(page) + + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=self.chunk_size, + chunk_overlap=64, + length_function=len, + is_separator_regex=False, + ) + docs = text_splitter.split_documents(docs) + + vectorstore = Chroma.from_documents(docs, self.embedding_function, persist_directory=vector_directory) + print("vector store created") + print(vectorstore) + + def create_vector_store_from_latex(self, latex_path: Path): + """ + create a chroma vector store from a latex file path + store the vector store in the db_directory/doc_name + where doc_name is the name of the latex file + + :param latex_path: + :return: + """ + doc_name = latex_path.name + vector_directory = self.db_directory/self.embedding_name/doc_name + + if os.path.isdir(vector_directory): + print(f"{vector_directory} found, not recreating a vector store") + return 0 + + print(f"creating vector store for {vector_directory}") + + with open(latex_path, mode="r") as file: + text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=100) + docs = text_splitter.split_document(file.read()) + + vectorstore = Chroma.from_documents(docs, self.embedding_function, persist_directory=vector_directory) + + def get_chroma(self, doc_name): + """ + get the chroma vector store for a given document name + + :param doc_name: + :return: + """ + vector_directory = self.db_directory/self.embedding_name/doc_name + return Chroma(persist_directory=vector_directory, embedding_function=self.embedding_function) + diff --git a/documents/pdfs/basic-laws-book-2016.pdf b/documents/pdfs/basic-laws-book-2016.pdf new file mode 100644 index 0000000..2c93d98 Binary files /dev/null and 
diff --git a/documents/pdfs/basic-laws-book-2016.pdf b/documents/pdfs/basic-laws-book-2016.pdf
new file mode 100644
index 0000000..2c93d98
Binary files /dev/null and b/documents/pdfs/basic-laws-book-2016.pdf differ
diff --git a/documents/pdfs/corr_exam.pdf b/documents/pdfs/corr_exam.pdf
new file mode 100644
index 0000000..03ac6b0
Binary files /dev/null and b/documents/pdfs/corr_exam.pdf differ
diff --git a/documents/pdfs/integration.pdf b/documents/pdfs/integration.pdf
new file mode 100644
index 0000000..78b07a2
Binary files /dev/null and b/documents/pdfs/integration.pdf differ
diff --git a/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/719410f6-f004-47e0-8cc8-ff36ecd820a6/data_level0.bin b/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/719410f6-f004-47e0-8cc8-ff36ecd820a6/data_level0.bin
new file mode 100644
index 0000000..5a3cfd4
Binary files /dev/null and b/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/719410f6-f004-47e0-8cc8-ff36ecd820a6/data_level0.bin differ
diff --git a/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/719410f6-f004-47e0-8cc8-ff36ecd820a6/header.bin b/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/719410f6-f004-47e0-8cc8-ff36ecd820a6/header.bin
new file mode 100644
index 0000000..a575974
Binary files /dev/null and b/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/719410f6-f004-47e0-8cc8-ff36ecd820a6/header.bin differ
diff --git a/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/719410f6-f004-47e0-8cc8-ff36ecd820a6/index_metadata.pickle b/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/719410f6-f004-47e0-8cc8-ff36ecd820a6/index_metadata.pickle
new file mode 100644
index 0000000..a38d5b9
Binary files /dev/null and b/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/719410f6-f004-47e0-8cc8-ff36ecd820a6/index_metadata.pickle differ
diff --git a/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/719410f6-f004-47e0-8cc8-ff36ecd820a6/length.bin b/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/719410f6-f004-47e0-8cc8-ff36ecd820a6/length.bin
new file mode 100644
index 0000000..95561d4
Binary files /dev/null and b/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/719410f6-f004-47e0-8cc8-ff36ecd820a6/length.bin differ
diff --git a/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/719410f6-f004-47e0-8cc8-ff36ecd820a6/link_lists.bin b/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/719410f6-f004-47e0-8cc8-ff36ecd820a6/link_lists.bin
new file mode 100644
index 0000000..7233fb3
Binary files /dev/null and b/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/719410f6-f004-47e0-8cc8-ff36ecd820a6/link_lists.bin differ
diff --git a/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/chroma.sqlite3 b/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/chroma.sqlite3
new file mode 100644
index 0000000..f633b6b
Binary files /dev/null and b/documents/vector_db/multilingual-e5-large/basic-laws-book-2016.pdf/chroma.sqlite3 differ
diff --git a/front_end/corr_exam.pdf b/front_end/corr_exam.pdf
new file mode 100644
index 0000000..03ac6b0
Binary files /dev/null and b/front_end/corr_exam.pdf differ
diff --git a/front_end/flagged/PDF File/bc51931986aa4ff95e72/integration.pdf b/front_end/flagged/PDF File/bc51931986aa4ff95e72/integration.pdf
new file mode 100644
index 0000000..78b07a2
Binary files /dev/null and b/front_end/flagged/PDF File/bc51931986aa4ff95e72/integration.pdf differ
diff --git a/front_end/flagged/log.csv b/front_end/flagged/log.csv
new file mode 100644
index 0000000..af15e5a
--- /dev/null
+++ b/front_end/flagged/log.csv
@@ -0,0 +1,9 @@
+PDF File,output,flag,username,timestamp
+flagged\PDF File\bc51931986aa4ff95e72\integration.pdf,"'
+
+",,,2024-04-15 02:34:48.167890
diff --git a/front_end/main.py b/front_end/main.py
new file mode 100644
index 0000000..481921e
--- /dev/null
+++ b/front_end/main.py
@@ -0,0 +1,134 @@
+import gradio as gr
+import os
+import subprocess
+from pathlib import Path
+
+from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
+
+from backend.vector_db_manager import VectorDbManager
+from backend.inference import InferenceInstance
+import time
+
+
+def get_accessible_port():
+    from socket import socket
+
+    # Bind to port 0 so the OS hands back a free ephemeral port
+    with socket() as s:
+        s.bind(('', 0))
+        return int(s.getsockname()[1])
+
+
+port = get_accessible_port()
+
+
+# Launch a simple HTTP server to serve the PDF files
+def start_server():
+    command = ['python', '-m', 'http.server', f"{port}"]
+    # Set the working directory to the documents folder to serve the PDF files
+    os.chdir(Path(os.getcwd()).parent / "documents")
+    subprocess.Popen(command)
+    # Return to the original working directory
+    os.chdir(Path(os.getcwd()).parent / "front_end")
+
+
+# Start the server
+start_server()
+
+# Create VectorDbManager and InferenceInstance
+embedding_func = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large", model_kwargs={'device': 'cuda'})
+base_db_directory = Path(r"../documents/vector_db")
+vector_db_manager = VectorDbManager(embedding_name="multilingual-e5-large", embedding_function=embedding_func,
+                                    chunk_size=512, db_directory=base_db_directory)
+inference_instance = InferenceInstance(vector_db_manager=vector_db_manager, nb_chunks_retrieved=4)
+
+user_message_global = ""
+doc_path = ""  # set by update_path when the user uploads a PDF
+
+
+def user(user_message, history):
+    global user_message_global
+    user_message_global = user_message
+    return "", history + [[user_message, None]]
+
+
+def bot(history):
+    global user_message_global, doc_path
+
+    if doc_path != "":
+        print("FOUND DOC_PATH")
+        vector_db_manager.create_vector_store_from_pdf(doc_path)
+    else:
+        print("NOT FOUND DOC_PATH")
+
+    bot_message = inference_instance.get_next_token(user_message_global, Path(doc_path).name)
+    history[-1][1] = ""
+    for message in bot_message:
+        history[-1][1] = message
+        time.sleep(0.05)
+        yield history
+
+
+def update_path(p):
+    """Update the global variable doc_path with the selected PDF path"""
+    global doc_path
+    doc_path = str(p)
+    print(f"Selected PDF path: {doc_path}")
+
+
+def pdf_viewer(pdf_file):
+    """Display the PDF file in an HTML viewer"""
+    pdf_path = Path(pdf_file)
+    pdf_working_dir = Path(os.getcwd()).parent / "documents" / "pdfs"
+
+    # Check if the PDF file is in the working directory
+    if not (pdf_working_dir / pdf_path.name).exists():
+        return f"""
+        <p>File {pdf_path.name} not found in the working directory</p>
+        <p>You can only access PDFs that are inside {pdf_working_dir}</p>
+        """
+
+    # Create the HTML code for the PDF viewer
+    # (the original markup was stripped in extraction; an iframe pointed at the
+    # local http.server started above, which serves ../documents, is assumed)
+    return f"""
+    <iframe src="http://localhost:{port}/pdfs/{pdf_path.name}" width="100%" height="800px"></iframe>
+    """
+
+
+# Define main Gradio tab
+with gr.Blocks() as main_tab:
+    with gr.Column():
+        with gr.Row():
+            with gr.Column(scale=12):
+                pdf_output = gr.HTML()
+        with gr.Row():
+            with gr.Column(scale=12):
+                file_input = gr.File(label="Select a PDF file")
+
+    with gr.Column():
+        with gr.Group():
+            chatbot = gr.Chatbot(scale=2)
+            msg = gr.Textbox(scale=2)
+            msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+                bot, chatbot, chatbot
+            )
+
+    file_input.change(pdf_viewer, inputs=file_input, outputs=pdf_output)
+    file_input.upload(update_path, inputs=file_input)
+
+
+# Define options tab
+with gr.Blocks() as options_tab:
+    with gr.Column():
+        with gr.Row():
+            with gr.Column(scale=12):
+                # TODO: Add options for the inference instance
+                gr.Textbox(label="Options", scale=2)
+
+
+app = gr.TabbedInterface([main_tab, options_tab], ["Main", "Options"])
+app.queue()
+app.launch()
diff --git a/front_end/test.html b/front_end/test.html
new file mode 100644
index 0000000..23a9430
--- /dev/null
+++ b/front_end/test.html
@@ -0,0 +1,21 @@
(21-line standalone HTML page lost in extraction; only its title, "PDF Viewer", survives)
diff --git a/front_end/test.py b/front_end/test.py
new file mode 100644
index 0000000..34a19fe
--- /dev/null
+++ b/front_end/test.py
@@ -0,0 +1,3 @@
+import os
+print("---")
+print(os.getcwd())
\ No newline at end of file