88 lines
3.1 KiB
Python
88 lines
3.1 KiB
Python
from langchain_core.embeddings import Embeddings
|
|
from langchain_community.vectorstores import Chroma
|
|
from langchain_community.embeddings import HuggingFaceEmbeddings
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
|
|
from langchain_community.document_loaders import PyPDFLoader
|
|
from pathlib import Path
|
|
import os
|
|
|
|
|
|
class VectorDbManager:
|
|
def __init__(self, embedding_function: Embeddings, embedding_name: str, chunk_size: int, db_directory: Path):
|
|
self.embedding_function = embedding_function
|
|
self.embedding_name = embedding_name
|
|
self.db_directory = db_directory
|
|
self.chunk_size = chunk_size
|
|
|
|
|
|
def create_vector_store_from_pdf(self, pdf_path):
|
|
"""
|
|
create a chroma vector store from a pdf file path
|
|
store the vector store in the db_directory/pdf_name
|
|
where pdf_name is the name of the pdf file
|
|
|
|
:param pdf_path:
|
|
:return:
|
|
"""
|
|
pdf_path = Path(pdf_path)
|
|
pdf_name = pdf_path.name
|
|
vector_directory = self.db_directory/self.embedding_name/pdf_name
|
|
|
|
if os.path.isdir(vector_directory):
|
|
print(f"{vector_directory} found, not recreating a vector store")
|
|
return 0
|
|
|
|
print(f"creating vector store for {vector_directory}")
|
|
file = PyPDFLoader(pdf_path)
|
|
|
|
docs = []
|
|
pages = file.load_and_split()
|
|
for j, page in enumerate(pages):
|
|
docs.append(page)
|
|
|
|
text_splitter = RecursiveCharacterTextSplitter(
|
|
chunk_size=self.chunk_size,
|
|
chunk_overlap=64,
|
|
length_function=len,
|
|
is_separator_regex=False,
|
|
)
|
|
docs = text_splitter.split_documents(docs)
|
|
|
|
vectorstore = Chroma.from_documents(docs, self.embedding_function, persist_directory=vector_directory)
|
|
print("vector store created")
|
|
print(vectorstore)
|
|
|
|
def create_vector_store_from_latex(self, latex_path: Path):
|
|
"""
|
|
create a chroma vector store from a latex file path
|
|
store the vector store in the db_directory/doc_name
|
|
where doc_name is the name of the latex file
|
|
|
|
:param latex_path:
|
|
:return:
|
|
"""
|
|
doc_name = latex_path.name
|
|
vector_directory = self.db_directory/self.embedding_name/doc_name
|
|
|
|
if os.path.isdir(vector_directory):
|
|
print(f"{vector_directory} found, not recreating a vector store")
|
|
return 0
|
|
|
|
print(f"creating vector store for {vector_directory}")
|
|
|
|
with open(latex_path, mode="r") as file:
|
|
text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=100)
|
|
docs = text_splitter.split_document(file.read())
|
|
|
|
vectorstore = Chroma.from_documents(docs, self.embedding_function, persist_directory=vector_directory)
|
|
|
|
def get_chroma(self, doc_name):
|
|
"""
|
|
get the chroma vector store for a given document name
|
|
|
|
:param doc_name:
|
|
:return:
|
|
"""
|
|
vector_directory = self.db_directory/self.embedding_name/doc_name
|
|
return Chroma(persist_directory=vector_directory, embedding_function=self.embedding_function)
|
|
|