+
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
034c595
Add `web_serialize`
MarkLindblad Jul 6, 2025
a1c9448
Change `.aryn.write(...)` to use `web_serialize` instead
MarkLindblad Jul 6, 2025
d1070ff
Add `@experimental` decorators to web serialization methods
MarkLindblad Jul 6, 2025
9f9a8cc
Fix mypy linting
MarkLindblad Jul 6, 2025
b570ad7
Fix black linting
MarkLindblad Jul 9, 2025
9c9e348
Switch to using `msgpack`
MarkLindblad Jul 9, 2025
5bf5795
Make more concise
MarkLindblad Jul 9, 2025
c9c1173
Fix ruff, black linting
MarkLindblad Jul 9, 2025
1439d08
Make `web_deserialize` more forward-compatible
MarkLindblad Jul 9, 2025
8fc848f
Mark web serialization as experimental, only allow serialization of b…
MarkLindblad Jul 9, 2025
185be6a
Merge branch 'main' into mark/serialize
MarkLindblad Jul 9, 2025
7ed3812
Avoid double `type` call
MarkLindblad Jul 9, 2025
2055406
Fix linting
MarkLindblad Jul 9, 2025
47031fe
Remove enumeration left in
MarkLindblad Jul 9, 2025
49f413d
Add magic bytes, version header, and zero padding
MarkLindblad Jul 9, 2025
a9227c4
Move helper functions to file scope
MarkLindblad Jul 9, 2025
99094e0
Change to use `aryn_element_type` instead of `_kind`
MarkLindblad Jul 9, 2025
c522165
Switch to streaming approach, add tests
MarkLindblad Jul 10, 2025
29f1c50
Switch warning message from `logging` to `warnings`
MarkLindblad Jul 10, 2025
eac53a0
Merge branch 'main' into mark/serialize
MarkLindblad Jul 15, 2025
202fc28
Change usage of `struct`
MarkLindblad Jul 18, 2025
c4ec0d3
Add terminator to end of web_serialized documents, add test, fix `tes…
MarkLindblad Jul 18, 2025
eb450e5
Make use of more compact `struct.pack` representation
MarkLindblad Jul 18, 2025
8c629ca
Rename `aryn_element_type` to `_aryn_element_type`
MarkLindblad Jul 18, 2025
d8845b5
Change `ValueError` to `RuntimeError` on Element web_serialization …
MarkLindblad Jul 18, 2025
a3f71d9
Rename `file` to `stream`
MarkLindblad Jul 18, 2025
ab96317
Remove unused `_reconstruct`
MarkLindblad Jul 18, 2025
18e9ce5
Fix ruff linting
MarkLindblad Jul 19, 2025
febf289
Make `"!8s2H4x"` a global variable
MarkLindblad Jul 21, 2025
342832e
Rename web serialization global variables
MarkLindblad Jul 21, 2025
c5759a1
Change unexpected end of file error message and type
MarkLindblad Jul 21, 2025
b226842
Fix message checked in unit test for unexpected end of stream
MarkLindblad Jul 21, 2025
e90cd74
Rename confusing variable named `read` to `got`
MarkLindblad Jul 21, 2025
88a30d2
Change `if len(to_add) == 0:` to `if not to_add:`
MarkLindblad Jul 21, 2025
01c619d
Change `readmin` to `read_header`
MarkLindblad Jul 21, 2025
99a1102
Raise error on bad header earlier
MarkLindblad Jul 21, 2025
ac7f1d1
Change error message on bad magic number
MarkLindblad Jul 21, 2025
051f89d
Remove dead code
MarkLindblad Jul 21, 2025
1d6a8c7
Change web serialization errors from ValueErrors to RuntimeErrors
MarkLindblad Jul 21, 2025
662cf08
Fix black linting
MarkLindblad Jul 21, 2025
4ea302e
Fix error type checked for in web serialization unit tests
MarkLindblad Jul 21, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 62 additions & 79 deletions lib/sycamore/poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions lib/sycamore/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ nltk = { version = "^3.9.0", optional = true }
anthropic = { version = "^0.42.0", optional = true }
google-genai = {version = "^1.14.0", optional = true}
tiktoken = "^0.8.0"
msgpack = "^1.1.1"

[tool.poetry.group.test.dependencies]
flake8 = "4.0.1"
Expand Down
2 changes: 1 addition & 1 deletion lib/sycamore/sycamore/connectors/aryn/ArynWriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def write_many_records(self, records: list["BaseDBWriter.Record"], target_params
for record in records:
assert isinstance(record, ArynWriterRecord)
doc = record.doc
files: Mapping = {"doc": doc.serialize()}
files: Mapping = {"doc": doc.web_serialize()}
sess.post(
url=f"{self.aryn_url}/docsets/write",
params={"docset_id": docset_id, "update_schema": update_schema},
Expand Down
77 changes: 76 additions & 1 deletion lib/sycamore/sycamore/data/document.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
from collections import UserDict
import json
from typing import Any, Optional
from typing import Any, Optional, BinaryIO
import struct

import msgpack

from sycamore.data import BoundingBox, Element
from sycamore.data.element import create_element
from sycamore.data.docid import mkdocid, nanoid36
from sycamore.decorators import experimental

DOCUMENT_WEB_SERIALIZATION_MAGIC = b"ArynSDoc"
DOCUMENT_WEB_SERIALIZATION_VERSION_MAJOR = 0
DOCUMENT_WEB_SERIALIZATION_VERSION_MINOR = 1
DOCUMENT_WEB_SERIALIZATION_HEADER_FORMAT = "!8s2H4x"


class DocumentSource:
Expand Down Expand Up @@ -202,6 +211,72 @@ def deserialize(raw: bytes) -> "Document":
else:
return Document(data)

@experimental
def web_serialize(self, stream: Optional[BinaryIO] = None) -> Optional[bytes]:
    """Write this document in the streaming web-serialization format.

    Layout written, in order:
      1. A fixed-size header packed with DOCUMENT_WEB_SERIALIZATION_HEADER_FORMAT
         (magic bytes, major version, minor version, zero padding).
      2. One msgpack object holding the document data without its elements.
      3. One msgpack object per element, written by ``Element.web_serialize``.
      4. The msgpack string ``"_TERMINATOR"`` marking the end of the document.

    Args:
        stream: Binary stream to write to. When omitted, the document is
            encoded into an in-memory buffer and the bytes are returned, so
            callers that need a payload (e.g. ``doc.web_serialize()``) work.

    Returns:
        ``None`` when ``stream`` is given; the serialized ``bytes`` otherwise.

    Raises:
        NotImplementedError: if ``self`` is a subclass of ``Document``.
        RuntimeError: if msgpack produces no output for the document data.
    """
    from io import BytesIO  # local import: only needed for the buffered path

    kind = type(self)
    # Exact type check on purpose: MetadataDocument, HierarchicalDocument and
    # SummaryDocument are not yet supported.
    if kind is not Document:
        raise NotImplementedError(f"web_serialize cannot yet handle type '{kind.__name__}'")

    buffer: Optional[BytesIO] = None
    if stream is None:
        buffer = BytesIO()
        stream = buffer

    stream.write(
        struct.pack(
            DOCUMENT_WEB_SERIALIZATION_HEADER_FORMAT,
            DOCUMENT_WEB_SERIALIZATION_MAGIC,
            DOCUMENT_WEB_SERIALIZATION_VERSION_MAJOR,
            DOCUMENT_WEB_SERIALIZATION_VERSION_MINOR,
        )
    )

    # Shallow copy so dropping "elements" does not mutate the document itself.
    elementless_data = self.data.copy()
    # pop() instead of del: tolerate a document whose data has no "elements" key.
    elementless_data.pop("elements", None)

    packed_elementless_data = msgpack.packb(elementless_data)
    if not packed_elementless_data:
        raise RuntimeError("Failed to serialize document")
    stream.write(packed_elementless_data)

    # Elements are streamed one at a time rather than packed as a single list.
    for element in self.elements:
        element.web_serialize(stream)
    msgpack.pack("_TERMINATOR", stream)

    return buffer.getvalue() if buffer is not None else None

@experimental
@staticmethod
def web_deserialize(stream: BinaryIO) -> "Document":
    """Rebuild a Document from the format written by ``web_serialize``.

    Reads and validates the fixed-size header, then consumes msgpack objects
    from ``stream``: the first is the document data without elements, each
    following object is one element, and the string ``"_TERMINATOR"`` ends
    the document.

    Args:
        stream: Binary stream positioned at the start of a serialized document.

    Returns:
        The reconstructed ``Document`` with its elements appended in order.

    Raises:
        RuntimeError: if the header cannot be read in full, the magic bytes
            or version do not match, or the stream ends before the
            terminator is seen (including right after the header).
    """

    def read_header(stream: BinaryIO) -> bytearray:
        # read() may return fewer bytes than requested, so loop until the
        # whole header has been collected or the stream is exhausted.
        header_size = struct.calcsize(DOCUMENT_WEB_SERIALIZATION_HEADER_FORMAT)
        data = bytearray()
        got = 0
        while got < header_size:
            to_add = stream.read(header_size - got)
            if not to_add:
                raise RuntimeError("Failed to read document header")
            data.extend(to_add)
            got += len(to_add)
        return data

    header = read_header(stream)
    magic_bytes, version_major, version_minor = struct.unpack(DOCUMENT_WEB_SERIALIZATION_HEADER_FORMAT, header)
    if magic_bytes != DOCUMENT_WEB_SERIALIZATION_MAGIC:
        raise RuntimeError("Input does not appear to be an Aryn serialized document (Bad magic number).")
    if (
        version_major != DOCUMENT_WEB_SERIALIZATION_VERSION_MAJOR
        or version_minor != DOCUMENT_WEB_SERIALIZATION_VERSION_MINOR
    ):
        raise RuntimeError(f"Unsupported serialization version: {version_major}.{version_minor}")

    # NOTE(review): msgpack.Unpacker reads from the stream in chunks, so it may
    # consume bytes past the terminator — confirm if callers reuse the stream.
    unpacker = msgpack.Unpacker(stream)
    try:
        elementless_data = next(unpacker)
    except StopIteration:
        # A stream truncated right after the header must fail the same way as
        # one truncated later, not leak a bare StopIteration to the caller.
        raise RuntimeError("Premature end of serialized document stream.") from None
    doc = Document(elementless_data)
    elements = doc.elements
    saw_terminator = False
    for obj in unpacker:
        if obj == "_TERMINATOR":
            saw_terminator = True
            break
        elements.append(Element.web_deserialize(obj))
    if not saw_terminator:
        raise RuntimeError("Premature end of serialized document stream.")
    return doc

@staticmethod
def from_row(row: dict[str, bytes]) -> "Document":
"""Unserialize a Ray row back into a Document."""
Expand Down
27 changes: 26 additions & 1 deletion lib/sycamore/sycamore/data/element.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from collections import UserDict
from io import BytesIO
import json
from typing import Any, Optional
from typing import Any, Optional, BinaryIO

from PIL import Image
import msgpack

from sycamore.data.bbox import BoundingBox
from sycamore.data.table import Table
from sycamore.decorators import experimental


class Element(UserDict):
Expand Down Expand Up @@ -125,6 +127,18 @@ def __gt__(self, other) -> bool:
oidx = -1 if other.element_index is None else other.element_index
return sidx > oidx

@experimental
def web_serialize(self, file: BinaryIO) -> None:
    """Pack this element's data with msgpack and write it to ``file``.

    Raises:
        RuntimeError: if msgpack yields no bytes for the element data.
    """
    packed = msgpack.packb(self.data)
    if not packed:
        raise RuntimeError("Failed to serialize element")
    file.write(packed)

@experimental
@staticmethod
def web_deserialize(obj: dict[str, Any]) -> "Element":
    """Build an element from one unpacked msgpack mapping.

    The mapping's entries are forwarded as keyword arguments to
    ``create_element``, which chooses the concrete element to construct.
    """
    element = create_element(**obj)
    return element


class ImageElement(Element):
def __init__(
Expand Down Expand Up @@ -257,6 +271,17 @@ def text_representation(self) -> Optional[str]:
def text_representation(self, text_representation: str) -> None:
self.data["text_representation"] = text_representation

@experimental
def web_serialize(self, file: BinaryIO) -> None:
    """Pack this element's data with msgpack and write it to ``file``.

    When the data holds a truthy ``"table"`` entry, that entry is replaced by
    the result of its ``to_dict()`` in a shallow copy of the data before
    packing, so the element's own data is left untouched.

    Raises:
        RuntimeError: if msgpack yields no bytes for the payload (same
            exception type as the base ``Element.web_serialize``).
    """
    payload = self.data
    if table := payload.get("table"):
        # Copy before swapping in the dict form so self.data keeps the table object.
        payload = payload.copy()
        payload["table"] = table.to_dict()
    if bits := msgpack.packb(payload):
        file.write(bits)
    else:
        # RuntimeError, not ValueError: keep the failure type consistent with
        # the other web serialization methods.
        raise RuntimeError("Failed to serialize element")


def create_element(element_index: Optional[int] = None, **kwargs) -> Element:
element: Element
Expand Down
Loading
点击 这是indexloc提供的php浏览器服务,不要输入任何密码和下载