+
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions lib/sycamore/sycamore/data/element.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,16 @@ def field_to_value(self, field: str) -> Any:

return dotted_lookup(self, field)

def __lt__(self, other) -> bool:
    """Return True when this element's ``element_index`` is below ``other``'s.

    An ``element_index`` of ``None`` is compared as ``-1`` on either side.

    Returns ``NotImplemented`` when ``other`` exposes no ``element_index``
    attribute, so Python can try the reflected operation or raise
    ``TypeError`` rather than failing with ``AttributeError``.
    """
    if not hasattr(other, "element_index"):
        return NotImplemented
    # A missing index is mapped to -1 so it compares below any index >= 0.
    sidx = -1 if self.element_index is None else self.element_index
    oidx = -1 if other.element_index is None else other.element_index
    return sidx < oidx

def __gt__(self, other) -> bool:
    """Return True when this element's ``element_index`` is above ``other``'s.

    An ``element_index`` of ``None`` is compared as ``-1`` on either side.

    Returns ``NotImplemented`` when ``other`` exposes no ``element_index``
    attribute, so Python can try the reflected operation or raise
    ``TypeError`` rather than failing with ``AttributeError``.
    """
    if not hasattr(other, "element_index"):
        return NotImplemented
    # A missing index is mapped to -1 so it compares below any index >= 0.
    sidx = -1 if self.element_index is None else self.element_index
    oidx = -1 if other.element_index is None else other.element_index
    return sidx > oidx


class ImageElement(Element):
def __init__(
Expand Down
26 changes: 1 addition & 25 deletions lib/sycamore/sycamore/tests/unit/transforms/test_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,14 @@
import pytest
from ray.data import Dataset

from sycamore.data import Document, Element
from sycamore.data import Document
from sycamore.transforms.partition import (
Partition,
HtmlPartitioner,
UnstructuredPdfPartitioner,
UnstructuredPPTXPartitioner,
SycamorePartitioner,
)
from sycamore.utils.bbox_sort import bbox_sorted_elements
from sycamore.connectors.file import BinaryScan
from sycamore.tests.config import TEST_DIR

Expand Down Expand Up @@ -156,29 +155,6 @@ def test_deformable_detr_partition(self, mocker, path, partition_count) -> None:
doc = Document.from_row(docset.take(limit=1)[0])
assert len(doc.elements) == partition_count

def test_sycamore_partitioner_elements_reorder(self) -> None:
    """Check the order bbox_sorted_elements returns for elements spread over three pages."""

    def boxed(page_number, *bbox):
        # Element carrying only a bbox tuple and its page number.
        return Element({"bbox": bbox, "properties": {"page_number": page_number}})

    # Page 3: e1.y1 < e0.y1 = e2.y1, and e0.x1 < e2.x1.
    e0 = boxed(3, 0.20, 0.50, 0.45, 0.70)
    e1 = boxed(3, 0.20, 0.21, 0.45, 0.41)
    e2 = boxed(3, 0.51, 0.50, 0.90, 0.70)

    # Page 1: e4, e5 in the left column with e4.y1 < e5.y1;
    # e3, e6 in the right column with e3.y1 < e6.y1.
    e3 = boxed(1, 0.52, 0.21, 0.90, 0.45)
    e4 = boxed(1, 0.10, 0.21, 0.48, 0.46)
    e5 = boxed(1, 0.10, 0.58, 0.48, 0.90)
    e6 = boxed(1, 0.58, 0.51, 0.90, 0.85)

    # Page 2: three identical boxes, which must keep their input order.
    e7 = boxed(2, 0.20, 0.21, 0.90, 0.41)
    e8 = boxed(2, 0.20, 0.21, 0.90, 0.41)
    e9 = boxed(2, 0.20, 0.21, 0.90, 0.41)

    shuffled = [e0, e1, e2, e3, e4, e5, e6, e7, e8, e9]
    expected = [e4, e5, e3, e6, e7, e8, e9, e1, e0, e2]

    assert bbox_sorted_elements(shuffled) == expected

@pytest.mark.skip(
reason="Breaks as of 2024-10-18. See https://github.com/aryn-ai/sycamore/actions/runs/11411766096"
)
Expand Down
14 changes: 4 additions & 10 deletions lib/sycamore/sycamore/tests/unit/utils/test_bbox_sort.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,8 @@
from typing import Any, Optional

from sycamore.data import Document, Element
from sycamore.utils.bbox_sort import (
collect_pages,
col_tag,
find_overlap,
bbox_sorted_elements,
bbox_sort_page,
bbox_sort_document,
)
from sycamore.utils.bbox_sort import col_tag, find_overlap, bbox_sort_page
from sycamore.utils.element_sort import collect_pages, sort_elements, sort_document


def mkElem(
Expand Down Expand Up @@ -99,7 +93,7 @@ def test_elements_basic() -> None:
e9 = mkElem(0.20, 0.21, 0.90, 0.41, 2)

elems = [e0, e1, e2, e3, e4, e5, e6, e7, e8, e9]
elems = bbox_sorted_elements(elems)
sort_elements(elems)
answer = [e4, e5, e3, e6, e7, e8, e9, e1, e0, e2]
assert elems == answer
assert_element_index_sorted(elems)
Expand All @@ -114,7 +108,7 @@ def test_document_basic() -> None:
e5 = mkElem(0.1, 0.1, 0.9, 0.2, 2)
doc = Document()
doc.elements = [e0, e1, e2, e3, e4, e5]
bbox_sort_document(doc)
sort_document(doc, mode="bbox")
answer = [e3, e2, e5, e4, e1, e0]
assert doc.elements == answer
assert_element_index_sorted(doc.elements)
Expand Down
159 changes: 159 additions & 0 deletions lib/sycamore/sycamore/tests/unit/utils/test_xycut_sort.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
from typing import Any, Optional

from sycamore.data import Document, Element
from sycamore.utils.xycut import xycut_sort_page
from sycamore.utils.element_sort import sort_elements, sort_document


def mkElem(
    left: float, top: float, right: float, bot: float, page: Optional[int] = None, type: str = "Text"
) -> Element:
    """Build an Element from bbox edges and a type, adding a page number only when one is given."""
    fields: dict[str, Any] = {"bbox": (left, top, right, bot), "type": type}
    if page is None:
        # No "properties" entry at all when the caller supplies no page.
        return Element(fields)
    fields["properties"] = {"page_number": page}
    return Element(fields)


def test_page_basic() -> None:
    """Expect the full-width top box first, then the left column, then the right column."""
    boxes = [
        (0.59, 0.25, 0.90, 0.60),  # e0: right column, above e3
        (0.10, 0.26, 0.40, 0.51),  # e1: left column, above e2
        (0.10, 0.58, 0.40, 0.90),  # e2: left column
        (0.60, 0.65, 0.90, 0.85),  # e3: right column
        (0.15, 0.10, 0.85, 0.15),  # e4: full width, at top
    ]
    page = [mkElem(*box) for box in boxes]
    e0, e1, e2, e3, e4 = page
    xycut_sort_page(page)
    assert page == [e4, e1, e2, e0, e3]


def test_elements_basic() -> None:
    """Sort a multi-page element list with mode="xycut" and check order and element_index."""
    specs = [
        # Page 3: e1.top < e0.top = e2.top, and e0.left < e2.left.
        (0.20, 0.50, 0.45, 0.70, 3),  # e0
        (0.20, 0.21, 0.45, 0.41, 3),  # e1
        (0.51, 0.50, 0.90, 0.70, 3),  # e2
        # Page 1: e4, e5 in the left column with e4.top < e5.top;
        # e3, e6 in the right column with e3.top < e6.top.
        (0.59, 0.25, 0.90, 0.60, 1),  # e3
        (0.10, 0.26, 0.40, 0.51, 1),  # e4
        (0.10, 0.58, 0.40, 0.90, 1),  # e5
        (0.60, 0.65, 0.90, 0.85, 1),  # e6
        # Page 2: identical boxes, which must keep their input order.
        (0.20, 0.21, 0.90, 0.41, 2),  # e7
        (0.20, 0.21, 0.90, 0.41, 2),  # e8
        (0.20, 0.21, 0.90, 0.41, 2),  # e9
    ]
    elems = [mkElem(*spec) for spec in specs]
    e0, e1, e2, e3, e4, e5, e6, e7, e8, e9 = elems

    sort_elements(elems, mode="xycut")

    expected = [e4, e5, e3, e6, e7, e8, e9, e1, e0, e2]
    assert elems == expected
    assert_element_index_sorted(elems)


def test_document_basic() -> None:
    """Sort a Document holding two boxes on each of three pages, passing xycut_sort_page as mode."""
    upper = (0.1, 0.1, 0.9, 0.2)
    lower = (0.1, 0.5, 0.9, 0.6)

    # Pages are supplied out of order (3, 1, 2), lower box before upper box.
    e0 = mkElem(*lower, 3)
    e1 = mkElem(*upper, 3)
    e2 = mkElem(*lower, 1)
    e3 = mkElem(*upper, 1)
    e4 = mkElem(*lower, 2)
    e5 = mkElem(*upper, 2)

    doc = Document()
    doc.elements = [e0, e1, e2, e3, e4, e5]
    sort_document(doc, mode=xycut_sort_page)

    expected = [e3, e2, e5, e4, e1, e0]
    assert doc.elements == expected
    assert_element_index_sorted(doc.elements)


def test_page_footer() -> None:
    """Same layout as the basic page test plus a Page-footer box, which is expected last."""
    body_boxes = [
        (0.59, 0.25, 0.90, 0.60),  # e0: right column, above e3
        (0.10, 0.26, 0.40, 0.51),  # e1: left column, above e2
        (0.10, 0.58, 0.40, 0.90),  # e2: left column
        (0.60, 0.65, 0.90, 0.85),  # e3: right column
        (0.15, 0.10, 0.85, 0.15),  # e4: full width, at top
    ]
    e0, e1, e2, e3, e4 = [mkElem(*box) for box in body_boxes]
    # e5 is typed as a page footer.
    e5 = mkElem(0.25, 0.95, 0.75, 1.0, type="Page-footer")

    page = [e0, e1, e2, e3, e4, e5]
    xycut_sort_page(page)
    assert page == [e4, e1, e2, e0, e3, e5]


def test_no_cut() -> None:
    """Check the order xycut_sort_page produces for the four-box layout below."""
    boxes = [
        (0.40, 0.70, 0.90, 0.90),  # e0
        (0.10, 0.40, 0.30, 0.90),  # e1
        (0.70, 0.10, 0.90, 0.60),  # e2
        (0.10, 0.10, 0.60, 0.30),  # e3
    ]
    page = [mkElem(*box) for box in boxes]
    e0, e1, e2, e3 = page
    xycut_sort_page(page)
    expected = [e3, e1, e0, e2]  # what bbox_sort gives
    assert page == expected


# bbox coordinates and reading order from page 9 of
# https://www.aemps.gob.es/medicamentosUsoHumano/informesPublicos/docs/IPT-viekirax-exviera.pdf
#
# Each tuple is (left, top, right, bottom, order): test_viekirax passes the
# first four values to mkElem as the bbox and expects the element to land at
# position `order` after xycut_sort_page.
g_viekirax_boxes = [
(0.9159, 0.0231, 0.9825, 0.1116, 0),
(0.5336, 0.1245, 0.9489, 0.1612, 15),
(0.0951, 0.1245, 0.5205, 0.1486, 1),
(0.0945, 0.1524, 0.5202, 0.3006, 2),
(0.5339, 0.1686, 0.9478, 0.2051, 16),
(0.5340, 0.2126, 0.9529, 0.2492, 17),
(0.5335, 0.2565, 0.8968, 0.2808, 18),
(0.5571, 0.2820, 0.9482, 0.3055, 19),
(0.0945, 0.3046, 0.5198, 0.3655, 3),
(0.5336, 0.3129, 0.8991, 0.3371, 20),
(0.5572, 0.3384, 0.9484, 0.3619, 21),
(0.5332, 0.3689, 0.8977, 0.3932, 22),
(0.0945, 0.3693, 0.5180, 0.3938, 4),
(0.5574, 0.3943, 0.9482, 0.4182, 23),
(0.0947, 0.3974, 0.5187, 0.4221, 5),
(0.5325, 0.4255, 0.9324, 0.4497, 24),
(0.0950, 0.4258, 0.5195, 0.4499, 6),
(0.5324, 0.4575, 0.7744, 0.4693, 25),
(0.0946, 0.4709, 0.2071, 0.4842, 7),
(0.0948, 0.4911, 0.5152, 0.5287, 8),
(0.0945, 0.5355, 0.5029, 0.5725, 9),
(0.0946, 0.5793, 0.4987, 0.6289, 10),
(0.0939, 0.6359, 0.5168, 0.7223, 11),
(0.0948, 0.7290, 0.5058, 0.7785, 12),
(0.0947, 0.7851, 0.5160, 0.8472, 13),
(0.0946, 0.8544, 0.5148, 0.8913, 14),
(0.5568, 0.4700, 0.9550, 0.4941, 26),
(0.5334, 0.5016, 0.7799, 0.5134, 27),
(0.5556, 0.5140, 0.9545, 0.5384, 28),
(0.5324, 0.5452, 0.8140, 0.5575, 29),
(0.5566, 0.5581, 0.9507, 0.5820, 30),
(0.5323, 0.5894, 0.8261, 0.6018, 31),
(0.5564, 0.6020, 0.9550, 0.6265, 32),
(0.5330, 0.6336, 0.9540, 0.6951, 33),
(0.5323, 0.7021, 0.9566, 0.7641, 34),
(0.5323, 0.7706, 0.9559, 0.8326, 35),
(0.5321, 0.8392, 0.9508, 0.9140, 36),
(0.4780, 0.9469, 0.5713, 0.9590, 37),
]


def test_viekirax() -> None:
    """Sort the viekirax page boxes and check each lands at the position recorded with it."""
    elems: list[Element] = []
    for left, top, right, bot, order in g_viekirax_boxes:
        box = mkElem(left, top, right, bot)
        # Stash the expected position in the text so it survives the sort.
        box.text_representation = str(order)
        elems.append(box)

    xycut_sort_page(elems)

    for position, elem in enumerate(elems):
        assert elem.text_representation == str(position)


def assert_element_index_sorted(elements: list[Element]) -> None:
    """Assert that ``element_index`` strictly increases along ``elements``.

    Passes trivially for an empty or single-element list.

    Raises:
        AssertionError: if any element's index is not below its successor's.
    """
    # Walk adjacent pairs directly instead of indexing with range(len(...)).
    assert all(
        prev.element_index < nxt.element_index  # type: ignore
        for prev, nxt in zip(elements, elements[1:])
    )
5 changes: 3 additions & 2 deletions lib/sycamore/sycamore/transforms/detr_partitioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from sycamore.data.element import create_element
from sycamore.transforms.table_structure.extract import DEFAULT_TABLE_STRUCTURE_EXTRACTOR
from sycamore.utils import choose_device
from sycamore.utils.bbox_sort import bbox_sort_page
from sycamore.utils.element_sort import sort_page
from sycamore.utils.cache import Cache
from sycamore.utils.image_utils import crop_to_bbox, image_to_bytes
from sycamore.utils.import_utils import requires_modules
Expand Down Expand Up @@ -172,6 +172,7 @@ def partition_pdf(
text_extraction_options: dict[str, Any] = {},
source: str = "",
output_label_options: dict[str, Any] = {},
sort_mode: Optional[str] = None,
**kwargs,
) -> list[Element]:
if use_partitioning_service:
Expand Down Expand Up @@ -223,7 +224,7 @@ def partition_pdf(
promote_title(page, title_candidate_elements)
else:
promote_title(page)
bbox_sort_page(page)
sort_page(page, mode=sort_mode)
elements.extend(page)
if output_format == "markdown":
md = elements_to_markdown(elements)
Expand Down
6 changes: 4 additions & 2 deletions lib/sycamore/sycamore/transforms/merge_elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from sycamore.utils.merge_utils import combine_strs_min_newline
from sycamore.transforms.llm_query import LLMTextQueryAgent
from sycamore.llms import LLM
from sycamore.utils.bbox_sort import bbox_sort_document
from sycamore.utils.element_sort import sort_document


class ElementMerger(ABC):
Expand Down Expand Up @@ -463,12 +463,14 @@ def __init__(
regex_pattern: Optional[Pattern] = None,
llm_prompt: Optional[str] = None,
llm: Optional[LLM] = None,
sort_mode: Optional[str] = None,
*args,
**kwargs,
):
self.regex_pattern = regex_pattern
self.llm_prompt = llm_prompt
self.llm = llm
self.sort_mode = sort_mode
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

self.element_sort = ELEMENT_SORT[sort_mode]


def merge_elements(self, document: Document) -> Document:

Expand All @@ -492,7 +494,7 @@ def merge_elements(self, document: Document) -> Document:
new_table_elements[-1]["properties"]["table_continuation"] = False
other_elements.extend(new_table_elements)
document.elements = other_elements
bbox_sort_document(document)
sort_document(document, mode=self.sort_mode)

return document

Expand Down
12 changes: 9 additions & 3 deletions lib/sycamore/sycamore/transforms/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from sycamore.utils.time_trace import timetrace
from sycamore.utils import choose_device
from sycamore.utils.aryn_config import ArynConfig
from sycamore.utils.bbox_sort import bbox_sort_document
from sycamore.utils.element_sort import sort_document

from sycamore.transforms.detr_partitioner_config import (
ARYN_DETR_MODEL,
Expand Down Expand Up @@ -164,6 +164,7 @@ def __init__(
min_partition_length: Optional[int] = 500,
include_metadata: bool = True,
retain_coordinates: bool = False,
sort_mode: Optional[str] = None,
):
super().__init__(device="cpu")
self._include_page_breaks = include_page_breaks
Expand All @@ -174,6 +175,7 @@ def __init__(
self._min_partition_length = min_partition_length
self._include_metadata = include_metadata
self._retain_coordinates = retain_coordinates
self._sort_mode = sort_mode

@staticmethod
def to_element(dict: dict[str, Any], element_index: Optional[int] = None, retain_coordinates=False) -> Element:
Expand Down Expand Up @@ -233,7 +235,7 @@ def partition(self, document: Document) -> Document:
]
del elements

bbox_sort_document(document)
sort_document(document, mode=self._sort_mode)
return document


Expand Down Expand Up @@ -429,6 +431,7 @@ class ArynPartitioner(Partitioner):
Here is an example set of output label options:
{"promote_title": True, "title_candidate_elements": ["Section-header", "Caption"]}
default: None (no element is promoted to "Title")
sort_mode: Reading order algorithm: bbox (default) or xycut.
kwargs: Additional keyword arguments to pass to the remote partitioner.
Example:
The following shows an example of using the ArynPartitioner to partition a PDF and extract
Expand Down Expand Up @@ -466,6 +469,7 @@ def __init__(
text_extraction_options: dict[str, Any] = {},
source: str = "",
output_label_options: dict[str, Any] = {},
sort_mode: Optional[str] = None,
**kwargs,
):
if use_partitioning_service:
Expand Down Expand Up @@ -508,6 +512,7 @@ def __init__(
self._text_extraction_options = text_extraction_options
self._source = source
self.output_label_options = output_label_options
self.sort_mode = sort_mode
self._kwargs = kwargs

@timetrace("SycamorePdf")
Expand Down Expand Up @@ -539,6 +544,7 @@ def partition(self, document: Document) -> Document:
text_extraction_options=self._text_extraction_options,
source=self._source,
output_label_options=self.output_label_options,
sort_mode=self.sort_mode,
**self._kwargs,
)
except Exception as e:
Expand All @@ -547,7 +553,7 @@ def partition(self, document: Document) -> Document:

document.elements = elements

bbox_sort_document(document)
sort_document(document, mode=self.sort_mode)

return document

Expand Down
Loading
Loading
点击 这是indexloc提供的php浏览器服务,不要输入任何密码和下载