* added log tab
* added references text box
* added options to choose embedding models
parent eedbb1b81a
commit 18f35b28c2
42 changed files with 911 additions and 441 deletions
backend/embeddings_manager.py (new file, 13 lines)
@@ -0,0 +1,13 @@
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
import torch

# dict : huggingface url -> max token length (will be chunk size)
MODELS_DICT = {"intfloat/multilingual-e5-large": 512,
               "intfloat/multilingual-e5-large-instruct": 512}


def get_embedding_model(name: str):
    if name in MODELS_DICT:
        return HuggingFaceEmbeddings(model_name=name, model_kwargs={'device': 'cuda'} if torch.cuda.is_available() else {})
    else:
        raise ValueError(f"Model {name} not found in the list of available models")
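A minimal sketch of how this new helper might be consumed when building a vector store. The RecursiveCharacterTextSplitter wiring is an assumption for illustration, not part of this commit:

from langchain.text_splitter import RecursiveCharacterTextSplitter

from embeddings_manager import MODELS_DICT, get_embedding_model

# Model name as it would arrive from the frontend dropdown (illustrative choice).
model_name = "intfloat/multilingual-e5-large"
embeddings = get_embedding_model(model_name)

# Per the comment in embeddings_manager.py, the dict value doubles as the
# chunk size, keeping chunks within the model's maximum token length.
splitter = RecursiveCharacterTextSplitter(chunk_size=MODELS_DICT[model_name])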
@@ -15,22 +15,22 @@ class InferenceInstance:
        self.vector_db_manager = vector_db_manager
        self.history = []
        self.nb_chunks_retrieved = nb_chunks_retrieved
        flush_relevant_content()

    def get_next_token(self, input_user: str, doc_name: str) -> Iterator[str]:
        is_pdf = doc_name.endswith(".pdf")
        print(f"doc_name: {doc_name}")
        new_assistant_message = {"role": "assistant", "content": ""}
        search_results = self._get_search_results(input_user, doc_name)
        print(f"search results: {search_results}")
        pages = self._update_history(input_user, search_results, is_pdf)
        pages_info = "pages used: p" + " p".join(pages)
        print(f"history: {self.history}")

        completion = self._get_completion()

        for chunk in completion:
            if chunk.choices[0].delta.content:
                new_assistant_message["content"] += chunk.choices[0].delta.content
                yield pages_info + "\n\n " + new_assistant_message["content"]

    def _get_search_results(self, input_user: str, doc_name: str):
        print(f"input_user: {input_user}")
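Each yielded value is the full answer-so-far prefixed with the page list, so a streaming frontend callback only has to forward the generator's output. A sketch, assuming an InferenceInstance bound to the name instance (the callback name is illustrative):

def answer(user_input: str, doc_name: str):
    # Gradio replaces the displayed text with each yielded value,
    # so re-yielding is enough to stream the growing answer.
    for partial in instance.get_next_token(user_input, doc_name):
        yield partial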
@@ -38,12 +38,21 @@ class InferenceInstance:
        return vector_db.similarity_search(input_user, k=4)

    def _update_history(self, input_user: str, search_results, is_pdf):
        references_textbox_content = ""
        some_context = ""
        pages = []
        for result in search_results:
            if is_pdf:
                pages.append(str(result.metadata['page']))
            some_context += result.page_content + "\n\n"
            pages_info = f'on page {result.metadata["page"]}' if is_pdf else 'in the document'
            references_textbox_content += f"**Relevant content viewed {pages_info}**: \n\n" \
                                          f" \n\n {result.page_content}\n\n" \
                                          "-----------------------------------\n\n"

        with open("../temp_file/relevant_content.mmd", "w") as f:
            f.write(references_textbox_content)

        self.history.append({"role": "system", "content": f"relevant content for user question {some_context}"})
        self.history.append({"role": "user", "content": input_user})
        return pages
@@ -56,3 +65,12 @@ class InferenceInstance:
            stream=True,
        )


def read_relevant_content():
    with open("../temp_file/relevant_content.mmd", "r") as f:
        return f.read()


def flush_relevant_content():
    with open("../temp_file/relevant_content.mmd", "w") as f:
        f.write("")
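These two helpers back the new references text box: _update_history writes the retrieved chunks to relevant_content.mmd, and the frontend re-reads that file. A sketch of the wiring, assuming a Gradio Blocks layout (component names are illustrative):

import gradio as gr

with gr.Blocks() as demo:
    references_box = gr.Markdown()
    # Re-read the temp file once per second so the box always shows
    # the chunks retrieved for the latest question.
    demo.load(read_relevant_content, None, references_box, every=1)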
backend/logger.py (new file, 33 lines)
@@ -0,0 +1,33 @@
import gradio as gr
import sys


class Logger:
    """
    Logger class that redirects stdout to a file while still printing
    to the terminal; used to feed the log textbox in the frontend.

    Adapted from: https://github.com/gradio-app/gradio/issues/2362#issuecomment-1424446778
    """
    def __init__(self, filename):
        self.terminal = sys.stdout
        self.log = open(filename, "w")

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        self.terminal.flush()
        self.log.flush()

    def isatty(self):
        return False


def read_logs():
    sys.stdout.flush()
    with open("../temp_file/output.log", "r") as f:
        return f.read()
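Following the pattern in the linked Gradio issue, the redirect would be installed once at startup and the log tab would poll read_logs. A sketch with illustrative component names:

import sys

import gradio as gr

from logger import Logger, read_logs

# From here on, every print lands in both the terminal and the log file.
sys.stdout = Logger("../temp_file/output.log")

with gr.Blocks() as demo:
    log_box = gr.Textbox(label="Logs", lines=10)
    demo.load(read_logs, None, log_box, every=1)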
@@ -1,6 +1,7 @@
import subprocess
from pathlib import Path
import time
from gradio import Info


def pdf_to_mmd(path_input: str):
@@ -10,6 +11,13 @@ def pdf_to_mmd(path_input: str):
    stream stderr to the front end
    """
    text = f"Converting {path_input} to LaTeX; " \
           f"this can take some time, especially for big documents. Check progress in your terminal. " \
           f"Wait until the conversion is done before asking the models questions."

    print(text)
    Info(text)

    output_dir = "../documents/mmds"
    command = ['nougat', path_input, "-o", output_dir]
    subprocess.run(command)
@@ -17,7 +25,6 @@ def pdf_to_mmd(path_input: str):
    # Change the math delimiters to the common delimiters used in MMD
    with open(f"{output_dir}/{str(Path(path_input).stem)}.mmd", "r+") as doc:
        content = doc.read()
        print(content)

        content = content.replace(r"\[", "$$").replace(r"\]", "$$")
        content = content.replace(r"\(", "$").replace(r"\)", "$")
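For instance, the two replacement passes turn Nougat's LaTeX-style delimiters into the $ / $$ forms that Markdown math renderers expect:

content = r"Euler: \(e^{i\pi}+1=0\) and \[\int_0^1 x\,dx=\frac{1}{2}\]"
content = content.replace(r"\[", "$$").replace(r"\]", "$$")
content = content.replace(r"\(", "$").replace(r"\)", "$")
print(content)  # Euler: $e^{i\pi}+1=0$ and $$\int_0^1 x\,dx=\frac{1}{2}$$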