first commit
commit 356f72fedc
24 changed files with 367 additions and 0 deletions
3 .idea/.gitignore generated vendored Normal file
@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml
10 .idea/ai_pdf.iml generated Normal file
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
21 .idea/inspectionProfiles/Project_Default.xml generated Normal file
@@ -0,0 +1,21 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredPackages">
        <value>
          <list size="1">
            <item index="0" class="java.lang.String" itemvalue="matplotlib" />
          </list>
        </value>
      </option>
    </inspection_tool>
    <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="W605" />
        </list>
      </option>
    </inspection_tool>
  </profile>
</component>
6 .idea/inspectionProfiles/profiles_settings.xml generated Normal file
@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
4 .idea/misc.xml generated Normal file
@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (ai_pdf)" project-jdk-type="Python SDK" />
</project>
8 .idea/modules.xml generated Normal file
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/ai_pdf.iml" filepath="$PROJECT_DIR$/.idea/ai_pdf.iml" />
    </modules>
  </component>
</project>
6 .idea/vcs.xml generated Normal file
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>
54 backend/inference.py Normal file
@@ -0,0 +1,54 @@
from openai import OpenAI
from typing import Iterator

from backend.vector_db_manager import VectorDbManager

# Point to the local server; I personally use LM Studio to run local LLMs.
# You can change this to any other OpenAI-compatible API endpoint, local or not.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")


class InferenceInstance:

    def __init__(self, vector_db_manager: VectorDbManager, nb_chunks_retrieved: int = 4):
        self.vector_db_manager = vector_db_manager
        self.history = []
        self.nb_chunks_retrieved = nb_chunks_retrieved

    def get_next_token(self, input_user: str, doc_name: str) -> Iterator[str]:
        new_assistant_message = {"role": "assistant", "content": ""}
        search_results = self._get_search_results(input_user, doc_name)
        print(f"search results: {search_results}")
        pages = self._update_history(input_user, search_results)
        pages_info = "pages used: p" + " p".join(pages)
        print(f"history: {self.history}")
        completion = self._get_completion()

        for chunk in completion:
            # The final chunk of a stream can carry a None delta; skip it
            if chunk.choices[0].delta.content is not None:
                new_assistant_message["content"] += chunk.choices[0].delta.content
            yield pages_info + " " + new_assistant_message["content"]

    def _get_search_results(self, input_user: str, doc_name: str):
        print(f"input_user: {input_user}")
        vector_db = self.vector_db_manager.get_chroma(doc_name)
        # Honour the configured number of retrieved chunks instead of a hardcoded 4
        return vector_db.similarity_search(input_user, k=self.nb_chunks_retrieved)

    def _update_history(self, input_user: str, search_results):
        some_context = ""
        pages = []
        for result in search_results:
            pages.append(str(result.metadata['page']))
            some_context += result.page_content + "\n\n"
        self.history.append({"role": "system", "content": f"relevant content for the user question: {some_context}"})
        self.history.append({"role": "user", "content": input_user})
        return pages

    def _get_completion(self):
        return client.chat.completions.create(
            model="local-model",
            messages=self.history,
            temperature=0.7,
            stream=True,
        )
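For context, a minimal sketch of driving this streaming generator outside Gradio. It assumes a VectorDbManager (here `manager`) configured as in front_end/main.py, and a vector store that already exists for a hypothetical "my_doc.pdf":

    from backend.inference import InferenceInstance

    instance = InferenceInstance(vector_db_manager=manager, nb_chunks_retrieved=4)
    # Each yielded string is the "pages used" prefix plus the whole answer
    # accumulated so far, so a consumer just overwrites its display each time.
    for partial in instance.get_next_token("Summarise the introduction", "my_doc.pdf"):
        print(partial)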
88 backend/vector_db_manager.py Normal file
@@ -0,0 +1,88 @@
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path
import os


class VectorDbManager:

    def __init__(self, embedding_function: Embeddings, embedding_name: str, chunk_size: int, db_directory: Path):
        self.embedding_function = embedding_function
        self.embedding_name = embedding_name
        self.db_directory = db_directory
        self.chunk_size = chunk_size

    def create_vector_store_from_pdf(self, pdf_path):
        """
        Create a Chroma vector store from a PDF file path and persist it
        under db_directory/embedding_name/pdf_name, where pdf_name is the
        name of the PDF file.

        :param pdf_path: path to the PDF file
        :return: 0 if the store already exists, None otherwise
        """
        pdf_path = Path(pdf_path)
        pdf_name = pdf_path.name
        vector_directory = self.db_directory / self.embedding_name / pdf_name

        if os.path.isdir(vector_directory):
            print(f"{vector_directory} found, not recreating a vector store")
            return 0

        print(f"creating vector store for {vector_directory}")
        loader = PyPDFLoader(str(pdf_path))
        docs = loader.load_and_split()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=64,
            length_function=len,
            is_separator_regex=False,
        )
        docs = text_splitter.split_documents(docs)

        vectorstore = Chroma.from_documents(docs, self.embedding_function, persist_directory=str(vector_directory))
        print("vector store created")
        print(vectorstore)

    def create_vector_store_from_latex(self, latex_path: Path):
        """
        Create a Chroma vector store from a LaTeX file path and persist it
        under db_directory/embedding_name/doc_name, where doc_name is the
        name of the LaTeX file.

        :param latex_path: path to the LaTeX file
        :return: 0 if the store already exists, None otherwise
        """
        doc_name = latex_path.name
        vector_directory = self.db_directory / self.embedding_name / doc_name

        if os.path.isdir(vector_directory):
            print(f"{vector_directory} found, not recreating a vector store")
            return 0

        print(f"creating vector store for {vector_directory}")

        with open(latex_path, mode="r") as file:
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=100)
            # create_documents splits a raw string and wraps the chunks as Documents
            docs = text_splitter.create_documents([file.read()])

        vectorstore = Chroma.from_documents(docs, self.embedding_function, persist_directory=str(vector_directory))

    def get_chroma(self, doc_name):
        """
        Get the Chroma vector store for a given document name.

        :param doc_name: name of the document whose store should be loaded
        :return: the persisted Chroma instance
        """
        vector_directory = self.db_directory / self.embedding_name / doc_name
        return Chroma(persist_directory=str(vector_directory), embedding_function=self.embedding_function)
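To make the on-disk layout concrete: stores are persisted under db_directory/embedding_name/document_name. A hedged sketch of building and reloading a store for one of the PDFs committed below (assumes the embedding model can be downloaded and that the working directory is the repository root):

    from pathlib import Path
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from backend.vector_db_manager import VectorDbManager

    manager = VectorDbManager(
        embedding_function=HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large"),
        embedding_name="multilingual-e5-large",
        chunk_size=512,
        db_directory=Path("documents/vector_db"),
    )
    manager.create_vector_store_from_pdf("documents/pdfs/integration.pdf")
    # Persisted under documents/vector_db/multilingual-e5-large/integration.pdf;
    # a second call finds that directory and returns 0 instead of rebuilding.
    db = manager.get_chroma("integration.pdf")
    print(db.similarity_search("integration by parts", k=2))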
BIN documents/pdfs/basic-laws-book-2016.pdf Normal file
Binary file not shown.
BIN documents/pdfs/corr_exam.pdf Normal file
Binary file not shown.
BIN documents/pdfs/integration.pdf Normal file
Binary file not shown.
BIN 6 further binary files (paths not rendered in this view)
Binary file not shown.
BIN front_end/corr_exam.pdf Normal file
Binary file not shown.
BIN front_end/flagged/PDF File/bc51931986aa4ff95e72/integration.pdf Normal file
Binary file not shown.
9 front_end/flagged/log.csv Normal file
@@ -0,0 +1,9 @@
PDF File,output,flag,username,timestamp
flagged\PDF File\bc51931986aa4ff95e72\integration.pdf,"'
<iframe
src=""http://localhost:8000/pdfs/integration.pdf""
:width=""1000px""
:height=""2000px""
frameborder=""0""
></iframe>
",,,2024-04-15 02:34:48.167890
134 front_end/main.py Normal file
@@ -0,0 +1,134 @@
import gradio as gr
import os
import subprocess
import time
from pathlib import Path

from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

from backend.vector_db_manager import VectorDbManager
from backend.inference import InferenceInstance


def get_accessible_port():
    """Ask the OS for a free port by binding to port 0."""
    from socket import socket

    with socket() as s:
        s.bind(('', 0))
        return int(s.getsockname()[1])


port = get_accessible_port()


# Launch a simple HTTP server to serve the PDF files


def start_server():
    command = ['python', '-m', 'http.server', f"{port}"]
    # Serve from the documents folder so the PDFs are reachable over HTTP,
    # without changing this process's own working directory
    subprocess.Popen(command, cwd=Path(os.getcwd()).parent / "documents")


# Start the server
start_server()

# Create the VectorDbManager and InferenceInstance

embedding_func = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large", model_kwargs={'device': 'cuda'})
base_db_directory = Path(r"../documents/vector_db")
vector_db_manager = VectorDbManager(embedding_name="multilingual-e5-large", embedding_function=embedding_func, chunk_size=512, db_directory=base_db_directory)
inference_instance = InferenceInstance(vector_db_manager=vector_db_manager, nb_chunks_retrieved=4)


user_message_global = ""
doc_path = ""  # set by update_path once a PDF has been uploaded


def user(user_message, history):
    global user_message_global
    user_message_global = user_message
    return "", history + [[user_message, None]]


def bot(history):
    global user_message_global, doc_path

    if doc_path != "":
        print("FOUND DOC_PATH")
        vector_db_manager.create_vector_store_from_pdf(doc_path)
    else:
        print("DOC_PATH NOT FOUND")

    # Path(...).name extracts the file name portably instead of splitting on "\\"
    bot_message = inference_instance.get_next_token(user_message_global, Path(doc_path).name)
    history[-1][1] = ""
    for message in bot_message:
        history[-1][1] = message
        time.sleep(0.05)
        yield history


def update_path(p):
    """Update the global variable doc_path with the selected PDF path"""
    global doc_path
    doc_path = str(p)
    print(f"Selected PDF path: {doc_path}")


def pdf_viewer(pdf_file):
    """Display the PDF file in an HTML viewer"""
    pdf_path = Path(pdf_file)
    pdf_working_dir = Path(os.getcwd()).parent / "documents" / "pdfs"

    # Check if the PDF file is in the working directory
    if not (pdf_working_dir / pdf_path.name).exists():
        return f"""<h1>File {pdf_path.name} not found in the working directory</h1>
        <p>You can only access PDFs that are inside {pdf_working_dir}</p>"""

    # Create the HTML code for the PDF viewer
    return f"""
    <iframe
        src="http://localhost:{port}/pdfs/{pdf_path.name}"
        width="100%"
        height="800px"
        style="border:none;"
    ></iframe>
    """


# Define main Gradio tab
with gr.Blocks() as main_tab:
    with gr.Column():
        with gr.Row():
            with gr.Column(scale=12):
                pdf_output = gr.HTML()
        with gr.Row():
            with gr.Column(scale=12):
                file_input = gr.File(label="Select a PDF file")

    with gr.Column():
        with gr.Group():
            chatbot = gr.Chatbot(scale=2)
            msg = gr.Textbox(scale=2)
            msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
                bot, chatbot, chatbot
            )

    file_input.change(pdf_viewer, inputs=file_input, outputs=pdf_output)
    file_input.upload(update_path, inputs=file_input)


# Define options tab
with gr.Blocks() as options_tab:
    with gr.Column():
        with gr.Row():
            with gr.Column(scale=12):
                # TODO: Add options for the inference instance
                gr.Textbox(label="Options", scale=2)


app = gr.TabbedInterface([main_tab, options_tab], ["Main", "Options"])
app.queue()
app.launch()
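The chat streaming hinges on Gradio's support for generator callbacks: bot yields the updated history on every token, and Gradio re-renders the Chatbot after each yield. A stripped-down, self-contained sketch of that pattern (hypothetical demo, not part of this commit):

    import time
    import gradio as gr

    def stream_reply(history):
        # history[-1] is [user_message, None]; fill the reply token by token
        history[-1][1] = ""
        for token in ["Hello", ", ", "world", "!"]:
            history[-1][1] += token
            time.sleep(0.2)
            yield history  # each yield re-renders the Chatbot component

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot()
        msg = gr.Textbox()
        msg.submit(lambda m, h: ("", h + [[m, None]]), [msg, chatbot], [msg, chatbot],
                   queue=False).then(stream_reply, chatbot, chatbot)

    demo.queue()     # generator callbacks need the queue enabled
    # demo.launch()  # uncomment to try it standalone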
21 front_end/test.html Normal file
@@ -0,0 +1,21 @@
<!DOCTYPE html>
<html>
<head>
    <title>PDF Viewer</title>
</head>
<body>

<!--
Place the following <div> element where you want the PDF to be displayed in your website. You can change the size using the width and height attributes.
-->
<div>

    <iframe
        src="C:\Users\CLEME\Pictures\corr_exam.pdf"
        width="500"
        height="678"
    >
    </iframe>
</div>

</body>
</html>
3 front_end/test.py Normal file
@@ -0,0 +1,3 @@
import os
print("---")
print(os.getcwd())