-
Notifications
You must be signed in to change notification settings - Fork 260
feat(supabase): add SupabaseGroongaDocumentStore and SupabaseGroongaRetriever #3266
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
63e66be
28d58bc
b8fb275
1a238a5
c2ac6f6
494a433
17877e7
7ddf365
e8f53bb
446088c
5c8d270
ceb8394
3ae853c
5be352e
d679975
3887dff
78231ce
6d9ef28
22005a5
310aa13
95808fb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,8 +1,12 @@ | ||
| # SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai> | ||
| # | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| from .embedding_retriever import SupabasePgvectorEmbeddingRetriever | ||
| from .groonga_retriever import SupabaseGroongaRetriever | ||
| from .keyword_retriever import SupabasePgvectorKeywordRetriever | ||
|
|
||
| __all__ = ["SupabasePgvectorEmbeddingRetriever", "SupabasePgvectorKeywordRetriever"] | ||
| __all__ = [ | ||
| "SupabaseGroongaRetriever", | ||
| "SupabasePgvectorEmbeddingRetriever", | ||
| "SupabasePgvectorKeywordRetriever", | ||
| ] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,165 @@ | ||
| # SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai> | ||
| # | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| import copy | ||
| from typing import Any | ||
|
|
||
| from haystack import component, default_from_dict, default_to_dict | ||
| from haystack.dataclasses import Document | ||
| from haystack.document_stores.types import FilterPolicy | ||
|
|
||
| from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore | ||
|
|
||
|
|
||
| @component | ||
| class SupabaseGroongaRetriever: | ||
| """ | ||
| Retrieves documents from SupabaseGroongaDocumentStore using PGroonga full-text search. | ||
|
|
||
| This retriever works without embeddings — it searches documents using plain text queries. | ||
| It can be used alongside SupabasePgvectorEmbeddingRetriever in hybrid search pipelines. | ||
|
|
||
| Note: async operations are not supported as the supabase-py sync client does not expose | ||
| awaitable query methods. Use the sync run() method instead. | ||
|
|
||
| Example usage: | ||
|
|
||
| ```python | ||
| from haystack_integrations.document_stores.supabase import SupabaseGroongaDocumentStore | ||
| from haystack_integrations.components.retrievers.supabase import SupabaseGroongaRetriever | ||
| from haystack.utils import Secret | ||
|
|
||
| document_store = SupabaseGroongaDocumentStore( | ||
| supabase_url="https://<project>.supabase.co", | ||
| supabase_key=Secret.from_env_var("SUPABASE_SERVICE_KEY"), | ||
| table_name="haystack_fts_documents", | ||
| ) | ||
| document_store.warm_up() | ||
|
|
||
| retriever = SupabaseGroongaRetriever(document_store=document_store, top_k=10) | ||
| result = retriever.run(query="python programming") | ||
| print(result["documents"]) | ||
| ``` | ||
| """ | ||
|
|
||
| def __init__( | ||
| self, | ||
| *, | ||
| document_store: SupabaseGroongaDocumentStore, | ||
| filters: dict[str, Any] | None = None, | ||
| top_k: int = 10, | ||
| filter_policy: str | FilterPolicy = FilterPolicy.REPLACE, | ||
| ) -> None: | ||
| """ | ||
| Initialize the SupabaseGroongaRetriever. | ||
|
|
||
| :param document_store: An instance of SupabaseGroongaDocumentStore. | ||
| :param filters: Optional filters applied to retrieved Documents. | ||
| :param top_k: Maximum number of Documents to return. Defaults to 10. | ||
| :param filter_policy: Policy to determine how filters are applied. | ||
| :raises ValueError: If document_store is not an instance of SupabaseGroongaDocumentStore. | ||
| """ | ||
| if not isinstance(document_store, SupabaseGroongaDocumentStore): | ||
| msg = "document_store must be an instance of SupabaseGroongaDocumentStore" | ||
| raise ValueError(msg) | ||
|
|
||
| self.document_store = document_store | ||
| self.filters = filters or {} | ||
| self.top_k = top_k | ||
| self.filter_policy = ( | ||
| filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy) | ||
| ) | ||
|
|
||
| @component.output_types(documents=list[Document]) | ||
| def run( | ||
| self, | ||
| query: str, | ||
| filters: dict[str, Any] | None = None, | ||
| top_k: int | None = None, | ||
| ) -> dict[str, list[Document]]: | ||
| """ | ||
| Runs the retriever on the given query. | ||
|
|
||
| :param query: The text query to search for. | ||
| :param filters: Optional runtime filters. Merged or replaced based on filter_policy. | ||
| :param top_k: Optional override for maximum number of documents to return. | ||
| :returns: Dictionary with key "documents" containing list of matching Documents. | ||
| """ | ||
| if not query: | ||
| return {"documents": []} | ||
|
|
||
| merged_filters = self._merge_filters(filters) | ||
| effective_top_k = top_k if top_k is not None else self.top_k | ||
|
|
||
| documents = self.document_store._groonga_retrieval( | ||
| query=query, | ||
| top_k=effective_top_k, | ||
| filters=merged_filters, | ||
| ) | ||
|
|
||
| return {"documents": documents} | ||
|
|
||
| @component.output_types(documents=list[Document]) | ||
| async def run_async( | ||
| self, | ||
| query: str, | ||
| filters: dict[str, Any] | None = None, | ||
| top_k: int | None = None, | ||
| ) -> dict[str, list[Document]]: | ||
| """ | ||
| Async version of run(). | ||
|
|
||
| Note: supabase-py's sync client does not support native async queries. | ||
| This method runs the synchronous retrieval and returns the result. | ||
| For fully async support, consider using acreate_client() from supabase-py | ||
| and refactoring the document store accordingly. | ||
|
|
||
| :param query: The text query to search for. | ||
| :param filters: Optional runtime filters. Merged or replaced based on filter_policy. | ||
| :param top_k: Optional override for maximum number of documents to return. | ||
| :returns: Dictionary with key "documents" containing list of matching Documents. | ||
| """ | ||
| return self.run(query=query, filters=filters, top_k=top_k) | ||
|
|
||
| def _merge_filters(self, runtime_filters: dict[str, Any] | None) -> dict[str, Any]: | ||
|
Contributor
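There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a shallow dict merge; we should change it and use the `apply_filter_policy` helper from Haystack instead. Concrete example: the init filters contain a "language == en" condition and the runtime filters contain a different condition, both expressed as `{"operator": ..., "conditions": [...]}`. What the current code produces: `{**self.filters, **runtime_filters}` keeps only the runtime filter, because both dicts share the same top-level keys (`operator`, `conditions`). The "language == en" condition is dropped. What MERGE should produce: a single logical filter that combines the init conditions and the runtime conditions. Check the `apply_filter_policy` helper. The fix is to drop `_merge_filters` and add the import `from haystack.document_stores.types.filter_policy import apply_filter_policy`. In run() / run_async(), replace: `merged_filters = self._merge_filters(filters)` with: `merged_filters = apply_filter_policy(self.filter_policy, self.filters, filters)`. Also make sure this is tested, see the Mixin tests in haystack.testing.document_stores |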
||
| """ | ||
| Merges runtime filters with init filters based on filter_policy. | ||
|
|
||
| :param runtime_filters: Filters passed at runtime. | ||
| :returns: Merged filters dictionary. | ||
| """ | ||
| if runtime_filters is not None: | ||
| if self.filter_policy == FilterPolicy.MERGE: | ||
| return {**self.filters, **runtime_filters} | ||
| return runtime_filters | ||
| return self.filters | ||
|
|
||
| def to_dict(self) -> dict[str, Any]: | ||
| """ | ||
| Serializes the component to a dictionary. | ||
|
|
||
| :returns: Dictionary with serialized data. | ||
| """ | ||
| return default_to_dict( | ||
| self, | ||
| filters=self.filters, | ||
| top_k=self.top_k, | ||
| filter_policy=self.filter_policy.value, | ||
| document_store=self.document_store.to_dict(), | ||
| ) | ||
|
|
||
| @classmethod | ||
| def from_dict(cls, data: dict[str, Any]) -> "SupabaseGroongaRetriever": | ||
| """ | ||
| Deserializes the component from a dictionary. | ||
|
|
||
| :param data: Dictionary to deserialize from. | ||
| :returns: Deserialized component. | ||
| """ | ||
| data = copy.deepcopy(data) | ||
| doc_store_params = data["init_parameters"]["document_store"] | ||
| data["init_parameters"]["document_store"] = SupabaseGroongaDocumentStore.from_dict(doc_store_params) | ||
| if filter_policy := data["init_parameters"].get("filter_policy"): | ||
| data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(filter_policy) | ||
| return default_from_dict(cls, data) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you also define an
`async def run_async(...)`? Does the client support it?