* Add conversion from PDF to a LaTeX-like (.mmd) format with Nougat
* Change vector_db_manager.py to handle .mmd files
* Add a "conversion" tab
* Add a math-mode checkbox in the main tab
This commit is contained in:
parent
356f72fedc
commit
11b92baaa8
17 changed files with 247 additions and 33 deletions
|
@ -17,28 +17,32 @@ class InferenceInstance:
|
|||
self.nb_chunks_retrieved = nb_chunks_retrieved
|
||||
|
||||
def get_next_token(self, input_user: str, doc_name: str) -> Iterator[Dict[str, str]]:
|
||||
is_pdf = doc_name.endswith(".pdf")
|
||||
print(f"doc_name: {doc_name}")
|
||||
new_assistant_message = {"role": "assistant", "content": ""}
|
||||
search_results = self._get_search_results(input_user, doc_name)
|
||||
print(f"search results: {search_results}")
|
||||
pages = self._update_history(input_user, search_results)
|
||||
pages = self._update_history(input_user, search_results, is_pdf)
|
||||
pages_info = f"pages used : p" + " p".join(pages)
|
||||
print(f"history: {self.history}")
|
||||
completion = self._get_completion()
|
||||
|
||||
for chunk in completion:
|
||||
new_assistant_message["content"] += chunk.choices[0].delta.content
|
||||
yield pages_info + " " + new_assistant_message["content"]
|
||||
if chunk.choices[0].delta.content:
|
||||
new_assistant_message["content"] += chunk.choices[0].delta.content
|
||||
yield pages_info + "\n\n " + new_assistant_message["content"]
|
||||
|
||||
def _get_search_results(self, input_user: str, doc_name: str):
|
||||
print(f"input_user: {input_user}")
|
||||
vector_db = self.vector_db_manager.get_chroma(doc_name)
|
||||
return vector_db.similarity_search(input_user, k=4)
|
||||
|
||||
def _update_history(self, input_user: str, search_results):
|
||||
def _update_history(self, input_user: str, search_results, is_pdf):
|
||||
some_context = ""
|
||||
pages = []
|
||||
for result in search_results:
|
||||
pages.append(str(result.metadata['page']))
|
||||
if is_pdf:
|
||||
pages.append(str(result.metadata['page']))
|
||||
some_context += result.page_content + "\n\n"
|
||||
self.history.append({"role": "system", "content": f"relevant content for user question {some_context}"})
|
||||
self.history.append({"role": "user", "content": input_user})
|
||||
|
|
16
backend/pdf_to_mmd.py
Normal file
16
backend/pdf_to_mmd.py
Normal file
|
@ -0,0 +1,16 @@
|
|||
import subprocess
|
||||
|
||||
|
||||
def pdf_to_mmd(path_input: str, output_dir: str = "../documents/mmds"):
    """
    Convert a PDF file to MMD format using the Nougat command-line tool
    https://github.com/facebookresearch/nougat

    :param path_input: path of the PDF file to convert
    :param output_dir: directory handed to nougat's ``-o`` option; the default
        is relative to the current working directory of the process
    :return: the ``subprocess.CompletedProcess`` of the nougat run, so callers
        can check ``returncode``; a non-zero exit code does not raise
    :raises FileNotFoundError: if the ``nougat`` executable cannot be found

    TODO: stream stderr to the front end (not implemented yet)
    """
    # Argument list with the default shell=False: paths containing spaces or
    # shell metacharacters reach nougat unmodified.
    command = ['nougat', path_input, "-o", output_dir]
    return subprocess.run(command)
|
||||
|
||||
|
||||
|
|
@ -14,7 +14,6 @@ class VectorDbManager:
|
|||
self.db_directory = db_directory
|
||||
self.chunk_size = chunk_size
|
||||
|
||||
|
||||
def create_vector_store_from_pdf(self, pdf_path):
|
||||
"""
|
||||
create a chroma vector store from a pdf file path
|
||||
|
@ -26,7 +25,7 @@ class VectorDbManager:
|
|||
"""
|
||||
pdf_path = Path(pdf_path)
|
||||
pdf_name = pdf_path.name
|
||||
vector_directory = self.db_directory/self.embedding_name/pdf_name
|
||||
vector_directory = self.db_directory / self.embedding_name / pdf_name
|
||||
|
||||
if os.path.isdir(vector_directory):
|
||||
print(f"{vector_directory} found, not recreating a vector store")
|
||||
|
@ -49,7 +48,7 @@ class VectorDbManager:
|
|||
docs = text_splitter.split_documents(docs)
|
||||
|
||||
vectorstore = Chroma.from_documents(docs, self.embedding_function, persist_directory=vector_directory)
|
||||
print("vector store created")
|
||||
print("pdf vector store created")
|
||||
print(vectorstore)
|
||||
|
||||
def create_vector_store_from_latex(self, latex_path: Path):
|
||||
|
@ -62,7 +61,7 @@ class VectorDbManager:
|
|||
:return:
|
||||
"""
|
||||
doc_name = latex_path.name
|
||||
vector_directory = self.db_directory/self.embedding_name/doc_name
|
||||
vector_directory = self.db_directory / self.embedding_name / doc_name
|
||||
|
||||
if os.path.isdir(vector_directory):
|
||||
print(f"{vector_directory} found, not recreating a vector store")
|
||||
|
@ -71,10 +70,13 @@ class VectorDbManager:
|
|||
print(f"creating vector store for {vector_directory}")
|
||||
|
||||
with open(latex_path, mode="r") as file:
|
||||
text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=100)
|
||||
docs = text_splitter.split_document(file.read())
|
||||
text_splitter = RecursiveCharacterTextSplitter.from_language(Language.MARKDOWN, chunk_size=self.chunk_size, chunk_overlap=64)
|
||||
texts = text_splitter.split_text(file.read())
|
||||
|
||||
vectorstore = Chroma.from_documents(docs, self.embedding_function, persist_directory=vector_directory)
|
||||
print(texts)
|
||||
vectorstore = Chroma.from_texts(texts, self.embedding_function, persist_directory=vector_directory)
|
||||
print("latex vector store created")
|
||||
print(vectorstore)
|
||||
|
||||
def get_chroma(self, doc_name):
|
||||
"""
|
||||
|
@ -83,6 +85,5 @@ class VectorDbManager:
|
|||
:param doc_name:
|
||||
:return:
|
||||
"""
|
||||
vector_directory = self.db_directory/self.embedding_name/doc_name
|
||||
vector_directory = self.db_directory / self.embedding_name / doc_name
|
||||
return Chroma(persist_directory=vector_directory, embedding_function=self.embedding_function)
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue