ai_pdf/backend/pdf_to_mmd.py
Crizomb 18f35b28c2 * added log tab
* added references text box
* added options to choose embedding models
2024-04-20 12:54:24 +02:00

37 lines
1.1 KiB
Python

import subprocess
from pathlib import Path
import time
from gradio import Info
def _to_dollar_delimiters(content: str) -> str:
    r"""Return *content* with bracket-style math delimiters turned into dollars.

    ``\[`` and ``\]`` become ``$$``; ``\(`` and ``\)`` become ``$``.
    """
    # NOTE(review): these are plain substring replacements, so a ``\[`` that is
    # the tail of ``\\[2pt]`` (row break with optional spacing) is rewritten as
    # well. Kept as-is to preserve the existing output; confirm whether such
    # input occurs in practice.
    content = content.replace(r"\[", "$$").replace(r"\]", "$$")
    return content.replace(r"\(", "$").replace(r"\)", "$")


def pdf_to_mmd(path_input: str, output_dir: str = "../documents/mmds") -> None:
    """
    Convert a PDF file to MMD format using the Nougat library
    https://github.com/facebookresearch/nougat

    The ``nougat`` command line tool is run on ``path_input`` and asked to
    write into ``output_dir``. Its stdout/stderr are not captured, so they go
    wherever this process's own streams go. The file
    ``<output_dir>/<stem of path_input>.mmd`` is then rewritten in place with
    the math delimiters changed to the dollar form.

    Args:
        path_input: Path of the PDF file to convert.
        output_dir: Directory handed to ``nougat -o``. A relative path is
            resolved against the current working directory.

    Raises:
        FileNotFoundError: If the ``nougat`` executable cannot be started, or
            if the expected ``.mmd`` file does not exist after the run.
    """
    # The separating space after "terminal." is required: the pieces are
    # concatenated into one sentence stream shown to the user.
    text = (
        f"Converting {path_input} to LaTex, "
        "it can take some time especially for big documents check progress in your terminal. "
        "Wait until the conversion is done to ask questions to the models."
    )
    print(text)
    Info(text)

    command = ['nougat', path_input, "-o", output_dir]
    # subprocess.run blocks until nougat has exited, so no extra sleep is
    # needed before reading its output.
    completed = subprocess.run(command)
    if completed.returncode != 0:
        print(f"nougat exited with status {completed.returncode} for {path_input}")

    mmd_path = Path(output_dir) / f"{Path(path_input).stem}.mmd"
    if not mmd_path.is_file():
        raise FileNotFoundError(
            f"Expected nougat output {mmd_path} was not found "
            f"(nougat exit status {completed.returncode})"
        )

    # Change the math delimiter to the common delimiter used in MMD.
    # Explicit UTF-8 keeps decoding independent of the platform locale
    # (assumes nougat writes UTF-8 -- TODO confirm).
    content = mmd_path.read_text(encoding="utf-8")
    mmd_path.write_text(_to_dollar_delimiters(content), encoding="utf-8")