first commit

Crizomb 2024-04-17 00:21:46 +02:00
commit 356f72fedc
24 changed files with 367 additions and 0 deletions

3
.idea/.gitignore generated vendored Normal file

@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

10
.idea/ai_pdf.iml generated Normal file

@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

21
.idea/inspectionProfiles/Project_Default.xml Normal file

@@ -0,0 +1,21 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredPackages">
        <value>
          <list size="1">
            <item index="0" class="java.lang.String" itemvalue="matplotlib" />
          </list>
        </value>
      </option>
    </inspection_tool>
    <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="W605" />
        </list>
      </option>
    </inspection_tool>
  </profile>
</component>

6
.idea/inspectionProfiles/profiles_settings.xml Normal file

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>

4
.idea/misc.xml generated Normal file

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (ai_pdf)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml generated Normal file

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/ai_pdf.iml" filepath="$PROJECT_DIR$/.idea/ai_pdf.iml" />
    </modules>
  </component>
</project>

6
.idea/vcs.xml generated Normal file

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>

54
backend/inference.py Normal file

@@ -0,0 +1,54 @@
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.chroma import Chroma
from openai import OpenAI
from backend.vector_db_manager import VectorDbManager
from typing import Optional, Iterator, Dict
from pathlib import Path

# Point to the local server; I personally use LM Studio to run local LLMs.
# You can change this to any other OpenAI API endpoint, local or not.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")


class InferenceInstance:
    def __init__(self, vector_db_manager: VectorDbManager, nb_chunks_retrieved: int = 4):
        self.vector_db_manager = vector_db_manager
        self.history = []
        self.nb_chunks_retrieved = nb_chunks_retrieved

    def get_next_token(self, input_user: str, doc_name: str) -> Iterator[str]:
        new_assistant_message = {"role": "assistant", "content": ""}
        search_results = self._get_search_results(input_user, doc_name)
        print(f"search results: {search_results}")
        pages = self._update_history(input_user, search_results)
        pages_info = "pages used : p" + " p".join(pages)
        print(f"history: {self.history}")
        completion = self._get_completion()
        for chunk in completion:
            # delta.content can be None on the final streamed chunk
            new_assistant_message["content"] += chunk.choices[0].delta.content or ""
            yield pages_info + " " + new_assistant_message["content"]

    def _get_search_results(self, input_user: str, doc_name: str):
        print(f"input_user: {input_user}")
        vector_db = self.vector_db_manager.get_chroma(doc_name)
        return vector_db.similarity_search(input_user, k=self.nb_chunks_retrieved)

    def _update_history(self, input_user: str, search_results):
        some_context = ""
        pages = []
        for result in search_results:
            pages.append(str(result.metadata['page']))
            some_context += result.page_content + "\n\n"
        self.history.append({"role": "system", "content": f"relevant content for user question {some_context}"})
        self.history.append({"role": "user", "content": input_user})
        return pages

    def _get_completion(self):
        return client.chat.completions.create(
            model="local-model",
            messages=self.history,
            temperature=0.7,
            stream=True,
        )

88
backend/vector_db_manager.py Normal file

@@ -0,0 +1,88 @@
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path
import os


class VectorDbManager:
    def __init__(self, embedding_function: Embeddings, embedding_name: str, chunk_size: int, db_directory: Path):
        self.embedding_function = embedding_function
        self.embedding_name = embedding_name
        self.db_directory = db_directory
        self.chunk_size = chunk_size

    def create_vector_store_from_pdf(self, pdf_path):
        """
        Create a Chroma vector store from a PDF file path.
        The vector store is persisted in db_directory/embedding_name/pdf_name,
        where pdf_name is the name of the PDF file.
        :param pdf_path: path to the PDF file
        :return:
        """
        pdf_path = Path(pdf_path)
        pdf_name = pdf_path.name
        vector_directory = self.db_directory / self.embedding_name / pdf_name
        if os.path.isdir(vector_directory):
            print(f"{vector_directory} found, not recreating a vector store")
            return 0

        print(f"creating vector store for {vector_directory}")
        file = PyPDFLoader(str(pdf_path))
        docs = []
        pages = file.load_and_split()
        for j, page in enumerate(pages):
            docs.append(page)

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=64,
            length_function=len,
            is_separator_regex=False,
        )
        docs = text_splitter.split_documents(docs)
        vectorstore = Chroma.from_documents(docs, self.embedding_function, persist_directory=str(vector_directory))
        print("vector store created")
        print(vectorstore)

    def create_vector_store_from_latex(self, latex_path: Path):
        """
        Create a Chroma vector store from a LaTeX file path.
        The vector store is persisted in db_directory/embedding_name/doc_name,
        where doc_name is the name of the LaTeX file.
        :param latex_path: path to the LaTeX source file
        :return:
        """
        doc_name = latex_path.name
        vector_directory = self.db_directory / self.embedding_name / doc_name
        if os.path.isdir(vector_directory):
            print(f"{vector_directory} found, not recreating a vector store")
            return 0

        print(f"creating vector store for {vector_directory}")
        with open(latex_path, mode="r") as file:
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=100)
            # create_documents wraps the raw text in Document objects before splitting
            docs = text_splitter.create_documents([file.read()])
            vectorstore = Chroma.from_documents(docs, self.embedding_function, persist_directory=str(vector_directory))

    def get_chroma(self, doc_name):
        """
        Get the Chroma vector store for a given document name.
        :param doc_name: name of the document whose vector store should be loaded
        :return:
        """
        vector_directory = self.db_directory / self.embedding_name / doc_name
        return Chroma(persist_directory=str(vector_directory), embedding_function=self.embedding_function)

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
front_end/corr_exam.pdf Normal file

Binary file not shown.


@@ -0,0 +1,9 @@
PDF File,output,flag,username,timestamp
flagged\PDF File\bc51931986aa4ff95e72\integration.pdf,"'
<iframe
src=""http://localhost:8000/pdfs/integration.pdf""
:width=""1000px""
:height=""2000px""
frameborder=""0""
></iframe>
",,,2024-04-15 02:34:48.167890

134
front_end/main.py Normal file

@@ -0,0 +1,134 @@
import gradio as gr
import os
import subprocess
from pathlib import Path
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from backend.vector_db_manager import VectorDbManager
from backend.inference import InferenceInstance
import time


def get_accessible_port():
    from socket import socket
    with socket() as s:
        s.bind(('', 0))
        return int(s.getsockname()[1])


port = get_accessible_port()


# Launch a simple HTTP server to serve the PDF files
def start_server():
    command = ['python', '-m', 'http.server', f"{port}"]
    # Set the working directory to the documents folder to serve the PDF files
    os.chdir(Path(os.getcwd()).parent / "documents")
    subprocess.Popen(command)
    # Return to the original working directory
    os.chdir(Path(os.getcwd()).parent / "front_end")


# Start the server
start_server()

# Create VectorDbManager and Inference instance
embedding_func = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large", model_kwargs={'device': 'cuda'})
base_db_directory = Path(r"../documents/vector_db")
vector_db_manager = VectorDbManager(embedding_name="multilingual-e5-large", embedding_function=embedding_func, chunk_size=512, db_directory=base_db_directory)
inference_instance = InferenceInstance(vector_db_manager=vector_db_manager, nb_chunks_retrieved=4)

user_message_global = ""
doc_path = ""  # set by update_path when the user uploads a PDF


def user(user_message, history):
    global user_message_global
    user_message_global = user_message
    return "", history + [[user_message, None]]


def bot(history):
    global user_message_global, doc_path
    if doc_path != "":
        print("FOUND DOC_PATH")
        vector_db_manager.create_vector_store_from_pdf(doc_path)
    else:
        print("NOT FOUND DOC_PATH")

    # Pass only the file name (platform-independent) to look up the vector store
    bot_message = inference_instance.get_next_token(user_message_global, Path(doc_path).name)
    history[-1][1] = ""
    for message in bot_message:
        history[-1][1] = message
        time.sleep(0.05)
        yield history


def update_path(p):
    """Update the global variable doc_path with the selected PDF path"""
    global doc_path
    doc_path = str(p)
    print(f"Selected PDF path: {doc_path}")


def pdf_viewer(pdf_file):
    """Display the PDF file in an HTML viewer"""
    pdf_path = Path(pdf_file)
    pdf_working_dir = Path(os.getcwd()).parent / "documents" / "pdfs"

    # Check if the PDF file is in the working directory
    if not (pdf_working_dir / pdf_path.name).exists():
        return f"""<h1>File {pdf_path.name} not found in the working directory</h1>
        <p>You can only access PDFs that are inside {pdf_working_dir}</p>"""

    # Create the HTML code for the PDF viewer
    return f"""
    <iframe
        src="http://localhost:{port}/pdfs/{pdf_path.name}"
        width="100%"
        height="800px"
        style="border:none;"
    ></iframe>
    """


# Define main Gradio tab
with gr.Blocks() as main_tab:
    with gr.Column():
        with gr.Row():
            with gr.Column(scale=12):
                pdf_output = gr.HTML()
        with gr.Row():
            with gr.Column(scale=12):
                file_input = gr.File(label="Select a PDF file")
    with gr.Column():
        with gr.Group():
            chatbot = gr.Chatbot(scale=2)
            msg = gr.Textbox(scale=2)
            msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
                bot, chatbot, chatbot
            )
    file_input.change(pdf_viewer, inputs=file_input, outputs=pdf_output)
    file_input.upload(update_path, inputs=file_input)

# Define options tab
with gr.Blocks() as options_tab:
    with gr.Column():
        with gr.Row():
            with gr.Column(scale=12):
                # TODO: Add options for the inference instance
                gr.Textbox(label="Options", scale=2)

app = gr.TabbedInterface([main_tab, options_tab], ["Main", "Options"])
app.queue()
app.launch()

21
front_end/test.html Normal file

@@ -0,0 +1,21 @@
<!DOCTYPE html>
<html>
<head>
  <title>PDF Viewer</title>
</head>
<body>
  <!--
    Place the following <div> element where you want the PDF to be displayed in your website.
    You can change the size using the width and height attributes.
  -->
  <div>
    <iframe
      src="C:\Users\CLEME\Pictures\corr_exam.pdf"
      width="500"
      height="678"
    >
    </iframe>
  </div>
</body>
</html>

3
front_end/test.py Normal file

@@ -0,0 +1,3 @@
import os
print("---")
print(os.getcwd())