+
Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions lib/sycamore/sycamore/connectors/opensearch/opensearch_reader.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import logging
from copy import deepcopy

Expand Down Expand Up @@ -288,7 +289,7 @@ def __init__(
logger.info(f"OpenSearchReader using PIT: {self.use_pit}")

@timetrace("OpenSearchReader")
def _to_parent_doc(self, slice_query: dict[str, Any]) -> List[dict[str, Any]]:
def _to_parent_doc(self, doc: dict[str, Any]) -> List[dict[str, Any]]:
"""
Get all parent documents from a given slice.
"""
Expand All @@ -304,6 +305,7 @@ def _to_parent_doc(self, slice_query: dict[str, Any]) -> List[dict[str, Any]]:
raise ValueError("Target is not present\n" f"Parameters: {self._query_params}\n")

os_client = client._client
slice_query = json.loads(doc["doc"])

assert (
get_doc_count_for_slice(os_client, slice_query) < 10000
Expand Down Expand Up @@ -341,7 +343,7 @@ def _to_parent_doc(self, slice_query: dict[str, Any]) -> List[dict[str, Any]]:
logger.info(f"Read {len(results)} documents from {self._query_params.index_name}")

except Exception as e:
raise ValueError(f"Error reading from target: {e}")
raise ValueError(f"Error reading from target: {e}, query: {slice_query}")
finally:
if client is not None:
client.close()
Expand All @@ -350,7 +352,7 @@ def _to_parent_doc(self, slice_query: dict[str, Any]) -> List[dict[str, Any]]:
return ret

@timetrace("OpenSearchReader")
def _to_doc(self, slice_query: dict[str, Any]) -> List[dict[str, Any]]:
def _to_doc(self, doc: dict[str, Any]) -> List[dict[str, Any]]:
"""
Get all documents from a given slice.
"""
Expand All @@ -368,6 +370,7 @@ def _to_doc(self, slice_query: dict[str, Any]) -> List[dict[str, Any]]:
raise ValueError("Target is not present\n" f"Parameters: {self._query_params}\n")

os_client = client._client
slice_query = json.loads(doc["doc"])
slice_count = get_doc_count_for_slice(os_client, slice_query)
assert slice_count <= 10000, f"Slice count ({slice_count}) should return <= 10,000 documents"

Expand Down Expand Up @@ -547,7 +550,8 @@ def _execute_pit(self, **kwargs) -> "Dataset":
}
if "query" in query:
_query["query"] = query["query"]
docs.append(_query)

docs.append({"doc": json.dumps(_query)})
logger.debug(f"Added slice {i} to the query {_query}")
except Exception as e:
raise ValueError(f"Error reading from target: {e}")
Expand Down
Loading
点击 这是indexloc提供的php浏览器服务,不要输入任何密码和下载