Skip to content

VectorChromaQueryEngine

autogen.agents.experimental.VectorChromaQueryEngine #

VectorChromaQueryEngine(db_path=None, embedding_function=None, metadata=None, llm=None, collection_name=None)

This engine leverages Chromadb to persist document embeddings in a named collection and LlamaIndex's VectorStoreIndex to efficiently index and retrieve documents, and generate an answer in response to natural language queries. The Chromadb collection serves as the storage layer, while the collection name uniquely identifies the set of documents within the persistent database.

This implements the autogen.agentchat.contrib.rag.RAGQueryEngine protocol.

Initializes the VectorChromaQueryEngine with db_path, metadata, embedding function, and llm. Args: db_path: The file system path where Chromadb will store its persistent data. If not specified, the default directory "./chroma" is used. embedding_function: A callable that converts text into vector embeddings. The default embedding uses the Sentence Transformers model all-MiniLM-L6-v2. For more embeddings that ChromaDB supports, please refer to embeddings. metadata: A dictionary containing configuration parameters for the Chromadb collection. This metadata is typically used to configure the HNSW indexing algorithm. For more details about the default metadata, please refer to HNSW configuration. llm: LLM model used by LlamaIndex for query processing. You can find more supported LLMs at LLM. collection_name (str): The unique name for the Chromadb collection. If omitted, a constant name will be used. Populate this to reuse previously ingested data.

Source code in autogen/agents/experimental/document_agent/chroma_query_engine.py
def __init__(  # type: ignore[no-any-unimported]
    self,
    db_path: Optional[str] = None,
    embedding_function: "Optional[EmbeddingFunction[Any]]" = None,
    metadata: Optional[dict[str, Any]] = None,
    llm: Optional["LLM"] = None,
    collection_name: Optional[str] = None,
) -> None:
    """Set up the query engine's LLM, embedding function, Chromadb client, and collection name.

    Args:
        db_path: File system path where Chromadb will store its persistent data.
            Defaults to "./chroma" when not specified.
        embedding_function: Callable that converts text into vector embeddings.
            Defaults to the Sentence Transformers model all-MiniLM-L6-v2. For more
            embeddings that ChromaDB supports, please refer to
            [embeddings](https://docs.trychroma.com/docs/embeddings/embedding-functions)
        metadata: Configuration parameters for the Chromadb collection, typically used
            to configure the HNSW indexing algorithm. For details about the defaults, see
            [HNSW configuration](https://cookbook.chromadb.dev/core/configuration/#hnsw-configuration)
        llm: LLM model used by LlamaIndex for query processing.
            Supported LLMs are listed at [LLM](https://docs.llamaindex.ai/en/stable/module_guides/models/llms/)
        collection_name: Unique name for the Chromadb collection. A constant default
            name is used when omitted; populate this to reuse previously ingested data.
    """
    # Default HNSW settings applied when the caller supplies no metadata.
    default_metadata: dict[str, Any] = {
        "hnsw:space": "ip",
        "hnsw:construction_ef": 30,
        "hnsw:M": 32,
    }
    self.llm: LLM = llm or OpenAI(model="gpt-4o", temperature=0.0)  # type: ignore[no-any-unimported]
    self.embedding_function: EmbeddingFunction[Any] = embedding_function or DefaultEmbeddingFunction()  # type: ignore[no-any-unimported,assignment]
    self.metadata: dict[str, Any] = metadata or default_metadata
    self.collection_name: Optional[str] = collection_name
    self.client = chromadb.PersistentClient(path=db_path or "./chroma")

    # Open (or create) the collection and build the index immediately.
    self.connect_db()

llm instance-attribute #

llm = llm or OpenAI(model='gpt-4o', temperature=0.0)

embedding_function instance-attribute #

embedding_function = embedding_function or DefaultEmbeddingFunction()

metadata instance-attribute #

metadata = metadata or {'hnsw:space': 'ip', 'hnsw:construction_ef': 30, 'hnsw:M': 32}

client instance-attribute #

client = PersistentClient(path=db_path or './chroma')

collection_name instance-attribute #

collection_name = collection_name

connect_db #

connect_db(*args, **kwargs)

Establish a connection to the Chromadb database and initialize the collection.

Source code in autogen/agents/experimental/document_agent/chroma_query_engine.py
def connect_db(self, *args: Any, **kwargs: Any) -> bool:
    """Open the Chromadb collection (creating it when absent) and build the query index.

    Returns:
        True once the collection and index are ready.
    """

    if not self.collection_name:
        self.collection_name = DEFAULT_COLLECTION_NAME

    # Log whether we are reusing an existing collection or creating a fresh one.
    if self._collection_exists(self.collection_name):
        logger.info(f"Using existing collection {self.collection_name} from the database.")
    else:
        logger.info(f"Creating new collection {self.collection_name} in the database.")

    # get_or_create=True makes this call safe in both cases above.
    self.collection = self.client.create_collection(
        name=self.collection_name,
        metadata=self.metadata,
        embedding_function=self.embedding_function,
        get_or_create=True,
    )
    self.index = self._create_index(self.collection)

    return True

query #

query(question)

Retrieve information from indexed documents by processing a natural language query.

PARAMETER DESCRIPTION
question

A natural language query string used to search the indexed documents.

TYPE: str

RETURNS DESCRIPTION
str

A string containing the response generated by LLM.

Source code in autogen/agents/experimental/document_agent/chroma_query_engine.py
def query(self, question: str) -> str:
    """Answer a natural language question using the indexed documents.

    Args:
        question: A natural language query string used to search the indexed documents.

    Returns:
        The response text generated by the LLM, or a canned reply when the
        engine produced an empty response.
    """
    self.validate_query_index()
    self.query_engine = self.index.as_query_engine(llm=self.llm)
    answer = str(self.query_engine.query(question))

    # Map LlamaIndex's sentinel "empty" response onto the engine's standard reply.
    return EMPTY_RESPONSE_REPLY if answer == EMPTY_RESPONSE_TEXT else answer

add_docs #

add_docs(new_doc_dir=None, new_doc_paths_or_urls=None)

Add additional documents to the existing vector index.

Loads new Docling-parsed Markdown files from a specified directory or a list of file paths and inserts them into the current index for future queries.

PARAMETER DESCRIPTION
new_doc_dir

The directory path from which to load additional documents. If provided, all eligible files in this directory are loaded.

TYPE: Optional[Union[Path, str]] DEFAULT: None

new_doc_paths_or_urls

A list of file paths specifying additional documents to load. Each file should be a Docling-parsed Markdown file.

TYPE: Optional[Sequence[Union[Path, str]]] DEFAULT: None

Source code in autogen/agents/experimental/document_agent/chroma_query_engine.py
def add_docs(
    self,
    new_doc_dir: Optional[Union[Path, str]] = None,
    new_doc_paths_or_urls: Optional[Sequence[Union[Path, str]]] = None,
) -> None:
    """Insert additional documents into the existing vector index.

    Loads Docling-parsed Markdown files from a directory and/or an explicit list of
    paths and adds them to the current index so they are available to future queries.

    Args:
        new_doc_dir: Directory from which to load additional documents; when
            provided, every eligible file in it is loaded.
        new_doc_paths_or_urls: Paths of additional documents to load; each should
            be a Docling-parsed Markdown file.
    """
    self.validate_query_index()
    documents = self._load_doc(
        input_dir=new_doc_dir or "",
        input_docs=new_doc_paths_or_urls or [],
    )
    for document in documents:
        self.index.insert(document)

get_collection_name #

get_collection_name()

Get the name of the collection used by the query engine.

RETURNS DESCRIPTION
str

The name of the collection.

Source code in autogen/agents/experimental/document_agent/chroma_query_engine.py
def get_collection_name(self) -> str:
    """Return the name of the Chromadb collection used by this query engine.

    Returns:
        The collection name.

    Raises:
        ValueError: If no collection name has been set.
    """
    if not self.collection_name:
        raise ValueError("Collection name not set.")
    return self.collection_name

validate_query_index #

validate_query_index()

Ensures an index exists

Source code in autogen/agents/experimental/document_agent/chroma_query_engine.py
def validate_query_index(self) -> None:
    """Raise if no vector index has been built yet (i.e. nothing has been ingested)."""
    if hasattr(self, "index"):
        return
    raise Exception("Query index is not initialized. Please ingest some documents before querying.")

init_db #

init_db(new_doc_dir=None, new_doc_paths_or_urls=None, *args, **kwargs)

Not required nor implemented for VectorChromaQueryEngine

Source code in autogen/agents/experimental/document_agent/chroma_query_engine.py
def init_db(
    self,
    new_doc_dir: Optional[Union[Path, str]] = None,
    new_doc_paths_or_urls: Optional[Sequence[Union[Path, str]]] = None,
    *args: Any,
    **kwargs: Any,
) -> bool:
    """Not required nor implemented for VectorChromaQueryEngine"""
    # Ingestion happens via add_docs(); this protocol method is intentionally unsupported.
    raise NotImplementedError("Method, init_db, not required nor implemented for VectorChromaQueryEngine")