# ai_pdf/backend/vector_db_manager.py

from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path
import os


class VectorDbManager:
    """
    Create and reopen per-document Chroma vector stores on disk.

    Every document gets its own store located at
    ``db_directory / embedding_name / <document file name>``.
    """

    def __init__(self, embedding_function: Embeddings, embedding_name: str, chunk_size: int, db_directory: Path):
        """
        :param embedding_function: embeddings object handed to Chroma
        :param embedding_name: name of the sub-directory (under db_directory) holding
            the stores built with this embedding
        :param chunk_size: chunk size given to the text splitter
        :param db_directory: root directory of all vector stores (a str path is accepted too)
        """
        self.embedding_function = embedding_function
        self.embedding_name = embedding_name
        # Normalise once so the "/" joins below also work when a plain string is passed.
        self.db_directory = Path(db_directory)
        self.chunk_size = chunk_size

    def _vector_directory(self, doc_name) -> Path:
        """Return the directory holding the vector store of the document named doc_name."""
        return self.db_directory / self.embedding_name / doc_name

    def create_vector_store_from_pdf(self, pdf_path):
        """
        create a chroma vector store from a pdf file path
        store the vector store in the db_directory/embedding_name/pdf_name
        where pdf_name is the name of the pdf file

        :param pdf_path: path (str or Path) of the pdf file to index
        :return: 0 when the store directory already exists (nothing is rebuilt),
            None once a new store has been created
        """
        pdf_path = Path(pdf_path)
        vector_directory = self._vector_directory(pdf_path.name)

        if vector_directory.is_dir():
            print(f"{vector_directory} found, not recreating a vector store")
            return 0

        print(f"creating vector store for {vector_directory}")
        # PyPDFLoader documents its file_path argument as a string, so do not hand it a Path.
        pages = PyPDFLoader(str(pdf_path)).load_and_split()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=64,
            length_function=len,
            is_separator_regex=False,
        )
        docs = text_splitter.split_documents(pages)

        # persist_directory is documented as a string as well.
        vectorstore = Chroma.from_documents(
            docs, self.embedding_function, persist_directory=str(vector_directory)
        )
        print("vector store created")
        print(vectorstore)

    def create_vector_store_from_latex(self, latex_path: Path):
        """
        create a chroma vector store from a latex file path
        store the vector store in the db_directory/embedding_name/doc_name
        where doc_name is the name of the latex file

        :param latex_path: path (str or Path) of the latex file to index
        :return: 0 when the store directory already exists (nothing is rebuilt),
            None once a new store has been created
        """
        latex_path = Path(latex_path)
        vector_directory = self._vector_directory(latex_path.name)

        if vector_directory.is_dir():
            print(f"{vector_directory} found, not recreating a vector store")
            return 0

        print(f"creating vector store for {vector_directory}")

        with open(latex_path, mode="r", encoding="utf-8") as file:
            text = file.read()

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=100)
        # The file content is raw text, not Document objects: create_documents splits
        # the text and wraps every chunk in a Document that Chroma can ingest.
        docs = text_splitter.create_documents([text], metadatas=[{"source": str(latex_path)}])

        Chroma.from_documents(docs, self.embedding_function, persist_directory=str(vector_directory))

    def get_chroma(self, doc_name):
        """
        get the chroma vector store for a given document name

        :param doc_name: file name of the document, as used when its store was created
        :return: a Chroma instance whose persist directory is
            db_directory/embedding_name/doc_name
        """
        vector_directory = self._vector_directory(doc_name)
        return Chroma(persist_directory=str(vector_directory), embedding_function=self.embedding_function)