* add conversion from PDF to a LaTeX-like (.mmd) format with Nougat

* change vector_db_manager.py to handle .mmd
* add "conversion" tab
* add math mode checkbox in the main tab
This commit is contained in:
Crizomb 2024-04-18 08:00:03 +02:00
parent 356f72fedc
commit 11b92baaa8
17 changed files with 247 additions and 33 deletions

View file

@ -14,7 +14,6 @@ class VectorDbManager:
self.db_directory = db_directory
self.chunk_size = chunk_size
def create_vector_store_from_pdf(self, pdf_path):
"""
create a chroma vector store from a pdf file path
@ -26,7 +25,7 @@ class VectorDbManager:
"""
pdf_path = Path(pdf_path)
pdf_name = pdf_path.name
vector_directory = self.db_directory/self.embedding_name/pdf_name
vector_directory = self.db_directory / self.embedding_name / pdf_name
if os.path.isdir(vector_directory):
print(f"{vector_directory} found, not recreating a vector store")
@ -49,7 +48,7 @@ class VectorDbManager:
docs = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(docs, self.embedding_function, persist_directory=vector_directory)
print("vector store created")
print("pdf vector store created")
print(vectorstore)
def create_vector_store_from_latex(self, latex_path: Path):
@ -62,7 +61,7 @@ class VectorDbManager:
:return:
"""
doc_name = latex_path.name
vector_directory = self.db_directory/self.embedding_name/doc_name
vector_directory = self.db_directory / self.embedding_name / doc_name
if os.path.isdir(vector_directory):
print(f"{vector_directory} found, not recreating a vector store")
@ -71,10 +70,13 @@ class VectorDbManager:
print(f"creating vector store for {vector_directory}")
with open(latex_path, mode="r") as file:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=100)
docs = text_splitter.split_document(file.read())
text_splitter = RecursiveCharacterTextSplitter.from_language(Language.MARKDOWN, chunk_size=self.chunk_size, chunk_overlap=64)
texts = text_splitter.split_text(file.read())
vectorstore = Chroma.from_documents(docs, self.embedding_function, persist_directory=vector_directory)
print(texts)
vectorstore = Chroma.from_texts(texts, self.embedding_function, persist_directory=vector_directory)
print("latex vector store created")
print(vectorstore)
def get_chroma(self, doc_name):
"""
@ -83,6 +85,5 @@ class VectorDbManager:
:param doc_name:
:return:
"""
vector_directory = self.db_directory/self.embedding_name/doc_name
vector_directory = self.db_directory / self.embedding_name / doc_name
return Chroma(persist_directory=vector_directory, embedding_function=self.embedding_function)