11 changes: 11 additions & 0 deletions lib/sycamore/sycamore/data/element.py
@@ -76,6 +76,16 @@ def properties(self, properties: dict[str, Any]):
def properties(self) -> None:
self.data["properties"] = {}

@property
def embedding(self) -> Optional[list[float]]:
"""Get the embedding for this element."""
return self.data.get("embedding")

@embedding.setter
def embedding(self, embedding: list[float]) -> None:
"""Set the embedding for this element."""
self.data["embedding"] = embedding

def __str__(self) -> str:
"""Return a pretty-printed string representing this Element."""
d = {
@@ -84,6 +94,7 @@ def __str__(self) -> str:
"binary_representation": (
f"<{len(self.binary_representation)} bytes>" if self.binary_representation else None
),
"embedding": (str(self.embedding[0:4]) + f"... <{len(self.embedding)} total>") if self.embedding else None,
"bbox": str(self.bbox),
"properties": {k: str(v) for k, v in self.properties.items()},
}
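A minimal usage sketch of the new Element.embedding accessor (the vector values below are illustrative, not produced by a real embedder):

from sycamore.data import Element

element = Element({"text_representation": "A short passage."})
assert element.embedding is None  # unset by default

element.embedding = [0.12, -0.05, 0.33, 0.08, 0.41]  # hypothetical values
# __str__ now truncates the vector, e.g. "[0.12, -0.05, 0.33, 0.08]... <5 total>"
print(element)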
12 changes: 9 additions & 3 deletions lib/sycamore/sycamore/docset.py
@@ -96,6 +96,12 @@ def _truncate(s):
amount_truncated = len(s) - truncate_length
return s[:truncate_length] + f" <{amount_truncated} chars>"

def _format_embedding(embedding):
"""Format the embedding to display its length."""
if embedding is None:
return None
return f"<{len(embedding)} floats>"

for document in documents:
if not show_elements:
num_elems = len(document.elements)
@@ -112,8 +118,7 @@ def _truncate(s):
document.text_representation = _truncate(document.text_representation)

if not show_embedding and document.embedding is not None:
embedding_length = len(document.embedding)
document.data["embedding"] = f"<{embedding_length} floats>"
document.data["embedding"] = _format_embedding(document.embedding)

if show_elements and "elements" in document.data:
if not show_binary:
@@ -125,7 +130,8 @@ def _truncate(s):
for i, e in enumerate(document.data["elements"]):
if e.get("text_representation") is not None:
e["text_representation"] = _truncate(e["text_representation"])

if e.get("embedding") is not None:
e.data["embedding"] = _format_embedding(e.embedding)
pprint.pp(document, stream=stream)

def count(self, include_metadata=False, **kwargs) -> int:
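A hedged sketch of the effect on show(): with show_embedding=False, the shared _format_embedding helper now collapses both document- and element-level embeddings to a length placeholder (the pre-embedded doc below is hypothetical):

import sycamore
from sycamore.data import Document

docs = [Document({"doc_id": "doc_0", "text_representation": "A passage.",
                  "embedding": [0.0] * 384, "elements": [], "properties": {}})]  # assumed pre-embedded

context = sycamore.init()
context.read.document(docs).show(show_embedding=False)
# The vector renders as 'embedding': '<384 floats>' rather than 384 raw values;
# element-level embeddings are collapsed the same way.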
72 changes: 33 additions & 39 deletions lib/sycamore/sycamore/tests/integration/transforms/test_embed.py
@@ -1,5 +1,4 @@
import sycamore
from sycamore.data import Document
from sycamore.data import Document, Element
from sycamore.transforms.embed import Embedder, BedrockEmbedder, OpenAIEmbedder, SentenceTransformerEmbedder

passages = [
@@ -24,14 +23,14 @@
]


def check_embedder(embedder: Embedder, expected_dim: int):
def check_embedder(embedder: Embedder, expected_dim: int, use_documents: bool = False, use_elements: bool = False):
docs = [
Document(
{
"doc_id": f"doc_{i}",
"type": "test",
"text_representation": passage,
"elements": [],
"text_representation": passage if use_documents else None,
"elements": [Element({"_element_index": 0, "text_representation": passage})] if use_elements else [],
"properties": {},
}
)
@@ -42,48 +41,43 @@ def check_embedder(embedder: Embedder, expected_dim: int):
assert len(new_docs) == len(docs)

for doc in new_docs:
if doc.text_representation != "":
assert doc.embedding is not None
assert len(doc.embedding) == expected_dim
if use_documents:
if doc.text_representation != "":
assert doc.embedding is not None
assert len(doc.embedding) == expected_dim
else:
assert doc.embedding is None
if use_elements:
for element in doc.elements:
if element.text_representation != "":
assert element.embedding is not None
assert len(element.embedding) == expected_dim
else:
assert element.embedding is None


def test_sentencetransformer_embedding():
check_embedder(
embedder=SentenceTransformerEmbedder(model_name="thenlper/gte-small", batch_size=100), expected_dim=384
embedder=SentenceTransformerEmbedder(model_name="thenlper/gte-small", batch_size=100),
expected_dim=384,
use_documents=True,
)
check_embedder(
embedder=SentenceTransformerEmbedder(model_name="thenlper/gte-small", batch_size=100),
expected_dim=384,
use_elements=True,
)
check_embedder(
embedder=SentenceTransformerEmbedder(model_name="thenlper/gte-small", batch_size=100),
expected_dim=384,
use_documents=True,
use_elements=True,
)


def test_openai_embedding():
check_embedder(embedder=OpenAIEmbedder(), expected_dim=1536)
check_embedder(embedder=OpenAIEmbedder(), expected_dim=1536, use_elements=True, use_documents=True)


def test_bedrock_embedding():
check_embedder(embedder=BedrockEmbedder(), expected_dim=1536)


def test_openai_embedding_batches():
docs = [
Document(
{
"doc_id": f"doc_{i}",
"type": "test",
"text_representation": f"Document text for passage {i}",
"elements": [],
"properties": {},
}
)
for i in range(5)
]

context = sycamore.init()
doc_set = context.read.document(docs)

embedder = SentenceTransformerEmbedder(model_name="thenlper/gte-small", batch_size=100)
embedded_doc_set = doc_set.embed(embedder=embedder) # OpenAIEmbedder(model_batch_size=3))

new_docs = embedded_doc_set.take()

assert len(new_docs) == len(docs)

for doc in new_docs:
assert len(doc.embedding) == 1536
check_embedder(embedder=BedrockEmbedder(), expected_dim=1536, use_elements=True, use_documents=True)
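For reference, a sketch of the element-level path that use_elements=True exercises: the document carries no text of its own, so after embedding only its elements should hold vectors (names mirror the test above; the embedder call is illustrative):

from sycamore.data import Document, Element

doc = Document(
    {
        "doc_id": "doc_0",
        "type": "test",
        "text_representation": None,
        "elements": [Element({"_element_index": 0, "text_representation": "Some passage."})],
        "properties": {},
    }
)
# After embedder([doc]) with a 384-dim model (assumed):
#   doc.embedding is None
#   len(doc.elements[0].embedding) == 384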
102 changes: 88 additions & 14 deletions lib/sycamore/sycamore/tests/unit/transforms/test_embed.py
@@ -2,13 +2,38 @@
import pytest
import ray.data

from sycamore.data import Document
from sycamore.data import Document, Element
from sycamore.plan_nodes import Node
from sycamore.transforms import Embed
from sycamore.transforms.embed import OpenAIEmbedder, SentenceTransformerEmbedder
from sycamore.transforms.embed import OpenAIEmbedder, SentenceTransformerEmbedder, BedrockEmbedder


class TestEmbedding:

def check_sentence_transformer(
self, model_name, dimension, texts, use_documents: bool = False, use_elements: bool = False
):
input_batch = []
for text in texts:
doc = Document()
if use_documents:
doc.text_representation = text
if use_elements:
element = Element()
element.text_representation = text
doc.elements = [element]
input_batch.append(doc)
embedder = SentenceTransformerEmbedder(model_name)
output_batch = embedder(doc_batch=input_batch)
for doc in output_batch:
if use_documents:
assert doc.embedding is not None
assert len(doc.embedding) == dimension
if use_elements:
for element in doc.elements:
assert element.embedding is not None
assert len(element.embedding) == dimension

"""Test data is sampled from different captions for the same image from
the Flickr30k dataset"""

@@ -52,25 +77,33 @@ class TestEmbedding:
],
)
def test_sentence_transformer(self, model_name, dimension, texts):
input_batch = []
for text in texts:
doc = Document()
doc.text_representation = text
input_batch.append(doc)
embedder = SentenceTransformerEmbedder(model_name)
output_batch = embedder(doc_batch=input_batch)
for doc in output_batch:
assert len(doc.embedding) == dimension
self.check_sentence_transformer(model_name, dimension, texts, use_documents=True, use_elements=True)
self.check_sentence_transformer(model_name, dimension, texts, use_elements=True)
self.check_sentence_transformer(model_name, dimension, texts, use_documents=True)

def test_sentence_transformer_embedding(self, mocker):
def check_sentence_transformer_embedding(self, mocker, use_documents: bool = False, use_elements: bool = False):
node = mocker.Mock(spec=Node)
embedding = Embed(
node,
embedder=SentenceTransformerEmbedder(model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=100),
)
texts = ["Members of a strike at Yale University.", "A woman is speaking at a podium outdoors."]
elements = [
{"_element_index": 1, "text_representation": texts[0], "embedding": None},
{
"_element_index": 2,
"text_representation": texts[1],
"embedding": None,
},
]
dicts = [
{"doc_id": 1, "text_representation": "Members of a strike at Yale University.", "embedding": None},
{"doc_id": 2, "text_representation": "A woman is speaking at a podium outdoors.", "embedding": None},
{
"doc_id": 1,
"text_representation": texts[0] if use_documents else None,
"embedding": None,
"elements": elements if use_elements else [],
},
{"doc_id": 2, "text_representation": texts[1] if use_documents else None, "embedding": None},
]
input_dataset = ray.data.from_items([{"doc": Document(dict).serialize()} for dict in dicts])
execute = mocker.patch.object(node, "execute")
@@ -79,9 +112,50 @@ def test_sentence_transformer_embedding(self, mocker):
output_dataset = embedding.execute()
output_dataset.show()

def test_sentence_transformer_embedding(self, mocker):
self.check_sentence_transformer_embedding(mocker, use_documents=True, use_elements=True)
self.check_sentence_transformer_embedding(mocker, use_elements=True)
self.check_sentence_transformer_embedding(mocker, use_documents=True)

def test_openai_embedder_pickle(self):
obj = OpenAIEmbedder()
obj._client = obj.client_wrapper.get_client()

pickle.dumps(obj)
assert True

def test_sentence_transformer_batch_size(self):
embedder = SentenceTransformerEmbedder(model_name="sentence-transformers/all-MiniLM-L6-v2")
assert embedder.model_batch_size == 100

embedder = SentenceTransformerEmbedder(model_name="sentence-transformers/all-MiniLM-L6-v2", model_batch_size=50)
assert embedder.model_batch_size == 50

embedder = BedrockEmbedder(model_batch_size=100)
assert embedder.model_batch_size == 1

embedder = BedrockEmbedder(model_batch_size=1)
assert embedder.model_batch_size == 1

embedder = OpenAIEmbedder(model_batch_size=120)
assert embedder.model_batch_size == 120

# Test batching
texts = ["text1", "text2", "text3", "text4"]
docs = [Document({"text_representation": t}) for t in texts]

embedders = [
SentenceTransformerEmbedder(model_name="sentence-transformers/all-MiniLM-L6-v2", model_batch_size=2),
OpenAIEmbedder(model_batch_size=2),
]
for embedder in embedders:
original_embed_texts = embedder.embed_texts

def mock_embed_texts(text_batch):
assert len(text_batch) == 2, "All batches should be size 2"
return original_embed_texts(text_batch)

embedder.embed_texts = mock_embed_texts

embedded_docs = embedder(docs)
assert len(embedded_docs) == len(texts), "All texts should be processed"
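The batching contract exercised above, as a standalone sketch: with model_batch_size=2, four texts should reach embed_texts as two batches of exactly two (plain-Python illustration, no embedder required):

texts = ["text1", "text2", "text3", "text4"]
batch_size = 2
batches = [texts[i : i + batch_size] for i in range(0, len(texts), batch_size)]
assert batches == [["text1", "text2"], ["text3", "text4"]]
assert all(len(b) == batch_size for b in batches)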