ai_pdf/backend/pdf_to_mmd.py

30 lines
836 B
Python

import subprocess
from pathlib import Path
import time
def pdf_to_mmd(path_input: str, output_dir: str = "../documents/mmds") -> None:
    r"""
    Convert a PDF file to MMD format using the Nougat library
    https://github.com/facebookresearch/nougat

    Runs the ``nougat`` command-line tool on ``path_input`` and then rewrites
    the generated ``<stem>.mmd`` file in place, replacing the ``\[ ... \]``
    and ``\( ... \)`` math delimiters with ``$$ ... $$`` and ``$ ... $``.

    The child process' stdout/stderr are not captured; they are inherited
    from this process.
    TODO: stream stderr to the front end.

    Args:
        path_input: Path of the PDF file to convert.
        output_dir: Directory passed to ``nougat -o``; the ``.mmd`` file is
            read back from here. A relative path is resolved against the
            current working directory.

    Raises:
        subprocess.CalledProcessError: If ``nougat`` exits with a non-zero
            status.
        FileNotFoundError: If the ``nougat`` executable cannot be found, or
            if the expected ``.mmd`` file was not produced.
    """
    command = ["nougat", path_input, "-o", output_dir]
    # check=True surfaces a failed conversion here instead of as an unrelated
    # "file not found" (or a silently re-processed stale file) further down.
    # run() blocks until the child exits, so no extra sleep is needed before
    # reading the output.
    subprocess.run(command, check=True)

    mmd_path = Path(output_dir) / f"{Path(path_input).stem}.mmd"

    # Use an explicit encoding so the result does not depend on the platform
    # locale (assumes Nougat writes the .mmd as UTF-8 — TODO confirm).
    content = mmd_path.read_text(encoding="utf-8")
    # Echo the raw Nougat output to stdout (existing behaviour, kept as is).
    print(content)

    # Change the math delimiter to the common delimiter used in MMD
    content = content.replace(r"\[", "$$").replace(r"\]", "$$")
    content = content.replace(r"\(", "$").replace(r"\)", "$")

    # write_text truncates the file before writing the converted content.
    mmd_path.write_text(content, encoding="utf-8")