[LCC-5] LangChain VectorStore 이해
임베딩을 통해 실수(float) 벡터값은 Vector Store에 저장을 한다. 그리고 사용자의 Query에 대해서도 벡터로 변환하여 Vector Store에 Sematic Search를 한다.
유사도가 높은 문서뭉치들을 프롬프트에 담아 LLM에 요청한다.
패키지
vectorstores 패키지는 langchain.vectorstores 패키지에서 langchain_community.vectorstores 패키지의 모듈을 re-export 하고 있다.
- 70 개가량의 벡터 저장소 구현체를 제공한다. https://python.langchain.com/v0.2/docs/integrations/vectorstores/
- 가장 많이 사용하는 것은 FAISS, Chroma 등이다.
Chroma 패키지는 내부적으로 chromadb 패키지를 사용하여 LangChain과 연동시켜주는 인터페이스이다.
- VectoreStore 생성시 "embedding_function", "collection_name"을 지정할 수 있다.
from chromadb import Client
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
# 1. OpenAI Embeddings 초기화
openai_embedding = OpenAIEmbeddings()
# 2. Chroma 벡터스토어 클라이언트 생성
client = Client()
# 3. Chroma 벡터스토어 생성
vectorstore = Chroma(
embedding_function=openai_embedding, # OpenAI Embedding 사용
collection_name="my_openai_collection", # 컬렉션 이름
client=client # Chroma 클라이언트
)
# 4. 데이터 추가
documents = ["This is a test document.", "Another document for testing."]
vectorstore.add_texts(texts=documents)
# 5. 데이터 검색
query = "test"
results = vectorstore.similarity_search(query, k=2)
# 6. 검색 결과 출력
for result in results:
print(f"Document: {result.page_content}, Score: {result.score}")
또는 from_documents 클래스메서드를 이용하여 호출한다.
- SentenceTransformerEmbeddings 는 내부적으로 HuggingFaceEmbeddings 을 사용하고 디폴디 임베딩 모델로 "BAAI/bge-large-en"을 사용한다.
- load() --> split_documents() --> from_documents() 를 통해서
# import
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter
# 문서를 로드하고 청크로 분할합니다.
loader = TextLoader("./data/appendix-keywords.txt")
documents = loader.load()
# 문서를 청크로 분할합니다.
text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
# 오픈 소스 임베딩 함수를 생성합니다.
stf_embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# Chroma에 로드합니다.
db = Chroma.from_documents(docs, stf_embeddings)
# 질의합니다.
query = "What is Word2Vec?"
docs = db.similarity_search(query)
# 결과를 출력합니다.
print(docs[0].page_content)
API
VectorStore 드라이버 구현체는 langchain_core의 VectoreStore를 상속받아 구현한다.
- VectorStore 를 상속받아 __init__ 구현
class Chroma(VectorStore):
def __init__(
self,
collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
embedding_function: Optional[Embeddings] = None,
persist_directory: Optional[str] = None,
client_settings: Optional[chromadb.config.Settings] = None,
collection_metadata: Optional[Dict] = None,
client: Optional[chromadb.Client] = None,
relevance_score_fn: Optional[Callable[[float], float]] = None,
) -> None:
"""Initialize with a Chroma client."""
try:
import chromadb
import chromadb.config
except ImportError:
raise ImportError(
"Could not import chromadb python package. "
"Please install it with `pip install chromadb`."
)
if client is not None:
self._client_settings = client_settings
self._client = client
self._persist_directory = persist_directory
else:
if client_settings:
# If client_settings is provided with persist_directory specified,
# then it is "in-memory and persisting to disk" mode.
client_settings.persist_directory = (
persist_directory or client_settings.persist_directory
)
if client_settings.persist_directory is not None:
# Maintain backwards compatibility with chromadb < 0.4.0
major, minor, _ = chromadb.__version__.split(".")
if int(major) == 0 and int(minor) < 4:
client_settings.chroma_db_impl = "duckdb+parquet"
_client_settings = client_settings
elif persist_directory:
# Maintain backwards compatibility with chromadb < 0.4.0
major, minor, _ = chromadb.__version__.split(".")
if int(major) == 0 and int(minor) < 4:
_client_settings = chromadb.config.Settings(
chroma_db_impl="duckdb+parquet",
)
else:
_client_settings = chromadb.config.Settings(is_persistent=True)
_client_settings.persist_directory = persist_directory
else:
_client_settings = chromadb.config.Settings()
self._client_settings = _client_settings
self._client = chromadb.Client(_client_settings)
self._persist_directory = (
_client_settings.persist_directory or persist_directory
)
self._embedding_function = embedding_function
self._collection = self._client.get_or_create_collection(
name=collection_name,
embedding_function=None,
metadata=collection_metadata,
)
self.override_relevance_score_fn = relevance_score_fn
- from_documents 는 classmethod 이면서 벡터객체를 반환한다.
- documents: splitted 된 document 리스트
- embedding: embedding model 인스턴스
@classmethod
def from_documents(
cls: Type[VST],
documents: List[Document],
embedding: Embeddings,
**kwargs: Any,
) -> VST:
"""Return VectorStore initialized from documents and embeddings.
Args:
documents: List of Documents to add to the vectorstore.
embedding: Embedding function to use.
**kwargs: Additional keyword arguments.
Returns:
VectorStore: VectorStore initialized from documents and embeddings.
"""
texts = [d.page_content for d in documents]
metadatas = [d.metadata for d in documents]
return cls.from_texts(texts, embedding, metadatas=metadatas, **kwargs)
내부에서 클래스메서드인 from_texts를 호출한다. from_texts는 abstractmethod로 각 vectorstore에서 구현한다.
@classmethod
@abstractmethod
def from_texts(
cls: Type[VST],
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
) -> VST:
"""Return VectorStore initialized from texts and embeddings.
Args:
texts: Texts to add to the vectorstore.
embedding: Embedding function to use.
metadatas: Optional list of metadatas associated with the texts.
Default is None.
kwargs: Additional keyword arguments.
Returns:
VectorStore: VectorStore initialized from texts and embeddings.
"""
- similarity_search 는 abstractmethod로 각 vectorstore에서 구현을 해야 한다.
@abstractmethod
def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
"""Return docs most similar to query.
Args:
query: Input text.
k: Number of Documents to return. Defaults to 4.
**kwargs: Arguments to pass to the search method.
Returns:
List of Documents most similar to the query.
"""
- search 는 검색 타입을 선택한다. "similarity", "similarity_score_threshold", "mmr" 등이 있다.
- 반환값은 List[Document] 이다.
def search(self, query: str, search_type: str, **kwargs: Any) -> List[Document]:
"""Return docs most similar to query using a specified search type.
Args:
query: Input text
search_type: Type of search to perform. Can be "similarity",
"mmr", or "similarity_score_threshold".
**kwargs: Arguments to pass to the search method.
Returns:
List of Documents most similar to the query.
Raises:
ValueError: If search_type is not one of "similarity",
"mmr", or "similarity_score_threshold".
"""
if search_type == "similarity":
return self.similarity_search(query, **kwargs)
elif search_type == "similarity_score_threshold":
docs_and_similarities = self.similarity_search_with_relevance_scores(
query, **kwargs
)
return [doc for doc, _ in docs_and_similarities]
elif search_type == "mmr":
return self.max_marginal_relevance_search(query, **kwargs)
else:
raise ValueError(
f"search_type of {search_type} not allowed. Expected "
"search_type to be 'similarity', 'similarity_score_threshold'"
" or 'mmr'."
)
- as_retriever 는 VectorStore 에는 VectorStoreRetriever 를 생성하여 반환한다.
def as_retriever(self, **kwargs: Any) -> VectorStoreRetriever:
Retriever 에 대해 다음 글에서 살펴보자.
<참조>
- 공식문서: https://python.langchain.com/v0.2/docs/integrations/vectorstores/
- LangChain KR: https://wikidocs.net/234013