* add conversion from pdf to latex-like (.mmd) format with nougat

* change vector_db_manager.py to handle .mmd
* add "conversion" tab
* add math mode checkbox in maintab
This commit is contained in:
Crizomb 2024-04-18 08:00:03 +02:00
parent 356f72fedc
commit 11b92baaa8
17 changed files with 247 additions and 33 deletions

Binary file not shown.

View file

@ -1,9 +0,0 @@
PDF File,output,flag,username,timestamp
flagged\PDF File\bc51931986aa4ff95e72\integration.pdf,"'
<iframe
src=""http://localhost:8000/pdfs/integration.pdf""
:width=""1000px""
:height=""2000px""
frameborder=""0""
></iframe>
",,,2024-04-15 02:34:48.167890
1 PDF File output flag username timestamp
2 flagged\PDF File\bc51931986aa4ff95e72\integration.pdf ' <iframe src="http://localhost:8000/pdfs/integration.pdf" :width="1000px" :height="2000px" frameborder="0" ></iframe> 2024-04-15 02:34:48.167890

View file

@ -7,6 +7,7 @@ from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from backend.vector_db_manager import VectorDbManager
from backend.inference import InferenceInstance
from backend.pdf_to_mmd import pdf_to_mmd
import time
@ -22,8 +23,6 @@ port = get_accessible_port()
# Launch a simple HTTP server to serve the PDF files
def start_server():
command = ['python', '-m', 'http.server', f"{port}"]
# Set the working directory to the documents folder to serve the PDF files
@ -57,12 +56,19 @@ def bot(history):
global user_message_global, doc_path
if doc_path != "":
print("FOUND DOC_PATH")
vector_db_manager.create_vector_store_from_pdf(doc_path)
print(f"FOUND DOC_PATH {doc_path}")
doc_extension = doc_path.split(".")[-1]
if doc_extension == "mmd":
vector_db_manager.create_vector_store_from_latex(Path(doc_path))
elif doc_extension == "pdf":
vector_db_manager.create_vector_store_from_pdf(doc_path)
else:
print(f"Unsupported extension: {doc_extension}")
else:
print("NOT FOUND DOC_PATH")
bot_message = inference_instance.get_next_token(user_message_global, doc_path.split("\\")[-1])
doc_name = Path(doc_path).stem + ".mmd" if math_checkbox.value else Path(doc_path).name
bot_message = inference_instance.get_next_token(user_message_global, doc_name)
history[-1][1] = ""
for message in bot_message:
history[-1][1] = message
@ -70,11 +76,21 @@ def bot(history):
yield history
def update_path(p):
def update_path(p, checked):
"""Update the global variable doc_path with the selected PDF path"""
print("Updating path")
global doc_path
doc_path = str(p)
print(f"Selected PDF path: {doc_path}")
name = Path(p).name
stem = Path(p).stem
if checked:
if not (Path(r"../documents/mmds") / (stem + ".mmd")).exists():
print(f"Converting {name} to MMD")
pdf_to_mmd(r"../documents/pdfs/" + name)
print(f"Selected DOC path: {stem}.mmd")
doc_path = r"../documents/mmds/" + stem + ".mmd"
else:
print(f"Selected DOC path: {name}")
doc_path = str(p)
def pdf_viewer(pdf_file):
@ -107,6 +123,7 @@ with gr.Blocks() as main_tab:
with gr.Row():
with gr.Column(scale=12):
file_input = gr.File(label="Select a PDF file")
math_checkbox = gr.Checkbox(label="Enable math mode (your pdf file will be converted to some latex-like format for the chatbot to understand it better)")
with gr.Column():
with gr.Group():
@ -117,7 +134,7 @@ with gr.Blocks() as main_tab:
)
file_input.change(pdf_viewer, inputs=file_input, outputs=pdf_output)
file_input.upload(update_path, inputs=file_input)
file_input.upload(update_path, inputs=[file_input, math_checkbox])
# Define options tab
@ -129,6 +146,21 @@ with gr.Blocks() as options_tab:
gr.Textbox(label="Options", scale=2)
app = gr.TabbedInterface([main_tab, options_tab], ["Main", "Options"])
# Define conversion tab
with gr.Blocks() as conversion_tab:
with gr.Column():
file_input = gr.File(label="Select a PDF file to convert to MMD")
html_output = gr.HTML(label="Output")
def upload_func(file_input):
name = Path(file_input).name
file_path = fr"../documents/pdfs/{name}"
pdf_to_mmd(file_path)
file_input.upload(upload_func, inputs=file_input)
app = gr.TabbedInterface([main_tab, options_tab, conversion_tab], ["Main", "Options", "Conversion"])
app.queue()
app.launch()