From ec0507eb70bc6a6fad276f843c3c14b1c780963b Mon Sep 17 00:00:00 2001 From: Alex Meyer Date: Wed, 14 May 2025 23:14:56 -0700 Subject: [PATCH 1/6] Initial draft of X-Y Cut reading order for Sycamore (Hackathon) --- lib/sycamore/sycamore/data/element.py | 10 + .../tests/unit/utils/test_xycut_sort.py | 95 +++++++ lib/sycamore/sycamore/utils/xycut.py | 238 ++++++++++++++++++ 3 files changed, 343 insertions(+) create mode 100644 lib/sycamore/sycamore/tests/unit/utils/test_xycut_sort.py create mode 100644 lib/sycamore/sycamore/utils/xycut.py diff --git a/lib/sycamore/sycamore/data/element.py b/lib/sycamore/sycamore/data/element.py index 5d51c48e2..bfd7ca36c 100644 --- a/lib/sycamore/sycamore/data/element.py +++ b/lib/sycamore/sycamore/data/element.py @@ -115,6 +115,16 @@ def field_to_value(self, field: str) -> Any: return dotted_lookup(self, field) + def __lt__(self, other) -> bool: + sidx = self.element_index or 0 + oidx = other.element_index or 0 + return sidx < oidx + + def __gt__(self, other) -> bool: + sidx = self.element_index or 0 + oidx = other.element_index or 0 + return sidx > oidx + class ImageElement(Element): def __init__( diff --git a/lib/sycamore/sycamore/tests/unit/utils/test_xycut_sort.py b/lib/sycamore/sycamore/tests/unit/utils/test_xycut_sort.py new file mode 100644 index 000000000..fc0094064 --- /dev/null +++ b/lib/sycamore/sycamore/tests/unit/utils/test_xycut_sort.py @@ -0,0 +1,95 @@ +from typing import Any, Optional + +from sycamore.data import Document, Element +from sycamore.utils.xycut import ( + xycut_sorted_elements, + xycut_sorted_page, + xycut_sort_document, +) + + +def mkElem( + left: float, top: float, right: float, bot: float, page: Optional[int] = None, type: str = "Text" +) -> Element: + d: dict[str, Any] = {"bbox": (left, top, right, bot), "type": type} + if page is not None: + d["properties"] = {"page_number": page} + return Element(d) + + +def test_page_basic() -> None: + # e1, e2 in left column, e1.top < e2.top + # e0, e3 in right column, e0.top < e3.top + # e4 full width, at top + e0 = mkElem(0.59, 0.25, 0.90, 0.60) + e1 = mkElem(0.10, 0.26, 0.40, 0.51) + e2 = mkElem(0.10, 0.58, 0.40, 0.90) + e3 = mkElem(0.60, 0.65, 0.90, 0.85) + e4 = mkElem(0.15, 0.10, 0.85, 0.15) + elems = [e0, e1, e2, e3, e4] + elems = xycut_sorted_page(elems) + answer = [e4, e1, e2, e0, e3] + assert elems == answer + + +def test_elements_basic() -> None: + # e1.top < e0.top = e2.top, e0.left < e2.left both on left + e0 = mkElem(0.20, 0.50, 0.45, 0.70, 3) + e1 = mkElem(0.20, 0.21, 0.45, 0.41, 3) + e2 = mkElem(0.51, 0.50, 0.90, 0.70, 3) + + # e4, e5 in left column, e4.top < e5.top + # e3, e6 in right column, e3.top < e6.top + e3 = mkElem(0.59, 0.25, 0.90, 0.60, 1) + e4 = mkElem(0.10, 0.26, 0.40, 0.51, 1) + e5 = mkElem(0.10, 0.58, 0.40, 0.90, 1) + e6 = mkElem(0.60, 0.65, 0.90, 0.85, 1) + + # all the same, test stable + e7 = mkElem(0.20, 0.21, 0.90, 0.41, 2) + e8 = mkElem(0.20, 0.21, 0.90, 0.41, 2) + e9 = mkElem(0.20, 0.21, 0.90, 0.41, 2) + + elems = [e0, e1, e2, e3, e4, e5, e6, e7, e8, e9] + elems = xycut_sorted_elements(elems) + answer = [e4, e5, e3, e6, e7, e8, e9, e1, e0, e2] + assert elems == answer + assert_element_index_sorted(elems) + + +def test_document_basic() -> None: + e0 = mkElem(0.1, 0.5, 0.9, 0.6, 3) + e1 = mkElem(0.1, 0.1, 0.9, 0.2, 3) + e2 = mkElem(0.1, 0.5, 0.9, 0.6, 1) + e3 = mkElem(0.1, 0.1, 0.9, 0.2, 1) + e4 = mkElem(0.1, 0.5, 0.9, 0.6, 2) + e5 = mkElem(0.1, 0.1, 0.9, 0.2, 2) + doc = Document() + doc.elements = [e0, e1, e2, e3, e4, e5] + xycut_sort_document(doc) + answer = [e3, e2, e5, e4, e1, e0] + assert doc.elements == answer + assert_element_index_sorted(doc.elements) + + +def test_page_footer() -> None: + # e1, e2 in left column, e1.top < e2.top + # e0, e3 in right column, e0.top < e3.top + # e4 full width, at top + # e5 in left column, but page-footer + e0 = mkElem(0.59, 0.25, 0.90, 0.60) + e1 = mkElem(0.10, 0.26, 0.40, 0.51) + e2 = mkElem(0.10, 0.58, 0.40, 0.90) + e3 = mkElem(0.60, 0.65, 0.90, 0.85) + e4 = mkElem(0.15, 0.10, 0.85, 0.15) + e5 = mkElem(0.25, 0.95, 0.75, 1.0, type="Page-footer") + elems = [e0, e1, e2, e3, e4, e5] + elems = xycut_sorted_page(elems) + answer = [e4, e1, e2, e0, e3, e5] + assert elems == answer + + +def assert_element_index_sorted(elements: list[Element]): + assert all( + elements[i].element_index < elements[i + 1].element_index for i in range(len(elements) - 1) # type: ignore + ) diff --git a/lib/sycamore/sycamore/utils/xycut.py b/lib/sycamore/sycamore/utils/xycut.py new file mode 100644 index 000000000..4f260a29c --- /dev/null +++ b/lib/sycamore/sycamore/utils/xycut.py @@ -0,0 +1,238 @@ +""" +Sort Elements into reading order based on bboxes using X-Y Cut. + +Most useful functions are at the bottom of the file. +""" + +from io import StringIO +from typing import Generator + +from sycamore.data import Document, Element +from sycamore.utils.bbox_sort import collect_pages + +type ElemList = list[Element] +type BeginEndList = list[tuple[float, int, Element]] + +XAXIS = 0 +YAXIS = 1 + +OPEN = 1 +CLOSE = 0 # sorts first + + +class NodeBase: + """Like a B-tree of Elements.""" + + def __str__(self) -> str: + sio = StringIO() + self.to_str_impl("", sio) + return sio.getvalue().rstrip() + + def to_text(self) -> str: + sio = StringIO() + self.to_text_impl(sio) + return sio.getvalue().rstrip() + + def to_elems(self) -> ElemList: + elems: ElemList = [] + self.to_elems_impl(elems) + return elems + + def to_str_impl(self, pfx: str, sio: StringIO) -> None: + raise NotImplementedError() + + def to_text_impl(self, sio: StringIO) -> None: + raise NotImplementedError() + + def to_elems_impl(self, elems: ElemList) -> None: + raise NotImplementedError() + + +class NodeInner(NodeBase): + """Node whose chidren are all Nodes.""" + + nodes: list[NodeBase] + + def __init__(self) -> None: + super().__init__() + self.nodes = [] + + def append(self, node: NodeBase) -> "NodeInner": + self.nodes.append(node) + return self + + def to_str_impl(self, pfx: str, sio: StringIO) -> None: + for idx, node in enumerate(self.nodes): + node.to_str_impl(f"{pfx}{idx:02d}.", sio) + + def to_text_impl(self, sio: StringIO) -> None: + for node in self.nodes: + node.to_text_impl(sio) + + def to_elems_impl(self, elems: ElemList) -> None: + for node in self.nodes: + node.to_elems_impl(elems) + + +class NodeLeaf(NodeBase): + """Node whose children are all lists of elements.""" + + elists: list[ElemList] + + def __init__(self) -> None: + super().__init__() + self.elists = [[]] + + def append(self, elem: Element) -> "NodeLeaf": + self.elists[-1].append(elem) + return self + + def extend(self, elist: ElemList) -> "NodeLeaf": + self.elists[-1].extend(elist) + return self + + def advance(self) -> "NodeLeaf": + self.elists.append([]) + return self + + def finalize(self) -> "NodeLeaf": + if not self.elists[-1]: + self.elists.pop() + return self + + def cansplit(self) -> bool: + try: + elem = self.elists[0][0] + next = self.elists[1] # noqa: F841 + return isinstance(elem, Element) + except (IndexError, TypeError): + return False + + def to_str_impl(self, pfx: str, sio: StringIO) -> None: + for idx, elist in enumerate(self.elists): + for i, elem in enumerate(elist): + bbox = get_bbox(elem) + sio.write(f"{pfx}{i:02d}:{bbox}\n") + # if tr := elem.text_representation: + # nonl = tr.replace("\n", " ") + # sio.write(f"{pfx}{i:02d}:[{nonl}]\n") + + def to_text_impl(self, sio: StringIO) -> None: + for elist in self.elists: + for elem in elist: + if tr := elem.text_representation: + sio.write(tr) + sio.write("\n\n") + + def to_elems_impl(self, elems: ElemList) -> None: + for elist in self.elists: + elems.extend(elist) + + +############################################################################### + + +def get_bbox(elem: Element) -> tuple: + if bbox := elem.data.get("bbox"): + return bbox + return (1.0, 1.0, 1.0, 1.0) + + +def make_begin_end(elems: ElemList, axis: int) -> BeginEndList: + """Returns array of (coord, isopen, elem)""" + ary: BeginEndList = [] + for elem in elems: + bbox = get_bbox(elem) + aa = bbox[axis] + bb = bbox[axis + 2] + if bb < aa: + aa, bb = bb, aa + ary.append((aa, OPEN, elem)) + ary.append((bb, CLOSE, elem)) + ary.sort() + return ary + + +def gen_overlap(ary: BeginEndList) -> Generator[tuple, None, None]: + """Yields tuple (coord, isopen, elem, count, width)""" + count = 0 + cur = ary[0] + for ii in range(len(ary) - 1): # end of ary will be a close + if cur[1]: + count += 1 + else: + count -= 1 + next = ary[ii + 1] + width = 0.0 if count else next[0] - cur[0] + yield (*cur, count, width) + cur = next + + +def widest_cut(order: BeginEndList) -> tuple[float, Element]: + rv = (0.0, order[0][2]) + if len(order) < 3: # order is twice as long as elems + return rv + for _, _, elem, count, width in gen_overlap(order): + if count == 0: + rv = max(rv, (width, elem)) + return rv + + +def choose_axis(elems: ElemList) -> tuple[BeginEndList, Element]: + xorder = make_begin_end(elems, XAXIS) + yorder = make_begin_end(elems, YAXIS) + xw, xe = widest_cut(xorder) + yw, ye = widest_cut(yorder) + if xw < yw: + return (yorder, ye) + return (xorder, xe) + + +def cleave_elems(elems: ElemList) -> NodeLeaf: + """Binary split across widest gap.""" + node = NodeLeaf() + if len(elems) < 2: + return node.extend(elems).finalize() + order, cut_after = choose_axis(elems) + for _, isopen, elem, cnt, width in gen_overlap(order): + if isopen: + node.append(elem) + if elem == cut_after: + node.advance() + return node.finalize() + + +def divide_node(node: NodeLeaf) -> NodeInner: + """Return replacement Node. Assume input is leaf.""" + inner = NodeInner() + for elist in node.elists: + subnode = cleave_elems(elist) + if subnode.cansplit(): + inner.append(divide_node(subnode)) # recursive step + else: + inner.append(subnode) + return inner + + +def xycut_sorted_page(elems: ElemList) -> ElemList: + if len(elems) < 2: + return elems + flat = NodeLeaf().extend(elems) + tree = divide_node(flat) + return tree.to_elems() + + +def xycut_sorted_elements(elements: ElemList, update_indices: bool = True) -> ElemList: + flat: ElemList = [] + pages = collect_pages(elements) + for page in pages: + elems = xycut_sorted_page(page) + flat.extend(elems) + if update_indices: + for idx, elem in enumerate(flat): + elem.element_index = idx + return flat + + +def xycut_sort_document(doc: Document, update_indices: bool = True) -> None: + doc.elements = xycut_sorted_elements(doc.elements, update_indices) From af7cf24cb05e9c6435ab2f547cdd11b2b2cf2bfa Mon Sep 17 00:00:00 2001 From: Alex Meyer Date: Thu, 15 May 2025 14:32:35 -0700 Subject: [PATCH 2/6] Minimal parameters to pick sort mode, default to bbox. --- .../tests/unit/transforms/test_partition.py | 25 +------------------ .../sycamore/transforms/detr_partitioner.py | 7 +++++- .../sycamore/transforms/merge_elements.py | 8 +++++- lib/sycamore/sycamore/transforms/partition.py | 16 ++++++++++-- lib/sycamore/sycamore/utils/markdown.py | 2 +- 5 files changed, 29 insertions(+), 29 deletions(-) diff --git a/lib/sycamore/sycamore/tests/unit/transforms/test_partition.py b/lib/sycamore/sycamore/tests/unit/transforms/test_partition.py index 24b78bf13..27a843bd6 100644 --- a/lib/sycamore/sycamore/tests/unit/transforms/test_partition.py +++ b/lib/sycamore/sycamore/tests/unit/transforms/test_partition.py @@ -12,7 +12,7 @@ UnstructuredPPTXPartitioner, SycamorePartitioner, ) -from sycamore.utils.bbox_sort import bbox_sorted_elements +from sycamore.utils.xycut import xycut_sorted_elements from sycamore.connectors.file import BinaryScan from sycamore.tests.config import TEST_DIR @@ -156,29 +156,6 @@ def test_deformable_detr_partition(self, mocker, path, partition_count) -> None: doc = Document.from_row(docset.take(limit=1)[0]) assert len(doc.elements) == partition_count - def test_sycamore_partitioner_elements_reorder(self) -> None: - # e1.y1 < e0.y1 = e2.y1, e0.x1 < e2.x1 both on left - e0 = Element({"bbox": (0.20, 0.50, 0.45, 0.70), "properties": {"page_number": 3}}) - e1 = Element({"bbox": (0.20, 0.21, 0.45, 0.41), "properties": {"page_number": 3}}) - e2 = Element({"bbox": (0.51, 0.50, 0.90, 0.70), "properties": {"page_number": 3}}) - - # e4, e5 in left column, e4.y < e5.y1; e3, e6 in right columns, e3.y1 < e6.y1 - e3 = Element({"bbox": (0.52, 0.21, 0.90, 0.45), "properties": {"page_number": 1}}) - e4 = Element({"bbox": (0.10, 0.21, 0.48, 0.46), "properties": {"page_number": 1}}) - e5 = Element({"bbox": (0.10, 0.58, 0.48, 0.90), "properties": {"page_number": 1}}) - e6 = Element({"bbox": (0.58, 0.51, 0.90, 0.85), "properties": {"page_number": 1}}) - - # all the same, test stable - e7 = Element({"bbox": (0.20, 0.21, 0.90, 0.41), "properties": {"page_number": 2}}) - e8 = Element({"bbox": (0.20, 0.21, 0.90, 0.41), "properties": {"page_number": 2}}) - e9 = Element({"bbox": (0.20, 0.21, 0.90, 0.41), "properties": {"page_number": 2}}) - - elements = [e0, e1, e2, e3, e4, e5, e6, e7, e8, e9] - elements = bbox_sorted_elements(elements) - result = [e4, e5, e3, e6, e7, e8, e9, e1, e0, e2] - - assert elements == result - @pytest.mark.skip( reason="Breaks as of 2024-10-18. See https://github.com/aryn-ai/sycamore/actions/runs/11411766096" ) diff --git a/lib/sycamore/sycamore/transforms/detr_partitioner.py b/lib/sycamore/sycamore/transforms/detr_partitioner.py index 1a478a2c1..81241ed54 100644 --- a/lib/sycamore/sycamore/transforms/detr_partitioner.py +++ b/lib/sycamore/sycamore/transforms/detr_partitioner.py @@ -20,6 +20,7 @@ from sycamore.transforms.table_structure.extract import DEFAULT_TABLE_STRUCTURE_EXTRACTOR from sycamore.utils import choose_device from sycamore.utils.bbox_sort import bbox_sort_page +from sycamore.utils.xycut import xycut_sorted_page from sycamore.utils.cache import Cache from sycamore.utils.image_utils import crop_to_bbox, image_to_bytes from sycamore.utils.import_utils import requires_modules @@ -172,6 +173,7 @@ def partition_pdf( text_extraction_options: dict[str, Any] = {}, source: str = "", output_label_options: dict[str, Any] = {}, + sort_mode: Optional[str] = None, **kwargs, ) -> list[Element]: if use_partitioning_service: @@ -223,7 +225,10 @@ def partition_pdf( promote_title(page, title_candidate_elements) else: promote_title(page) - bbox_sort_page(page) + if sort_mode and sort_mode == "xycut": + page = xycut_sorted_page(page) + else: + bbox_sort_page(page) elements.extend(page) if output_format == "markdown": md = elements_to_markdown(elements) diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py index 1b047ca6c..830c11fb7 100644 --- a/lib/sycamore/sycamore/transforms/merge_elements.py +++ b/lib/sycamore/sycamore/transforms/merge_elements.py @@ -14,6 +14,7 @@ from sycamore.transforms.llm_query import LLMTextQueryAgent from sycamore.llms import LLM from sycamore.utils.bbox_sort import bbox_sort_document +from sycamore.utils.xycut import xycut_sort_document class ElementMerger(ABC): @@ -463,12 +464,14 @@ def __init__( regex_pattern: Optional[Pattern] = None, llm_prompt: Optional[str] = None, llm: Optional[LLM] = None, + sort_mode: Optional[str] = None, *args, **kwargs, ): self.regex_pattern = regex_pattern self.llm_prompt = llm_prompt self.llm = llm + self.sort_mode = sort_mode def merge_elements(self, document: Document) -> Document: @@ -492,7 +495,10 @@ def merge_elements(self, document: Document) -> Document: new_table_elements[-1]["properties"]["table_continuation"] = False other_elements.extend(new_table_elements) document.elements = other_elements - bbox_sort_document(document) + if self.sort_mode and self.sort_mode == "xycut": + xycut_sort_document(document) + else: + bbox_sort_document(document) return document diff --git a/lib/sycamore/sycamore/transforms/partition.py b/lib/sycamore/sycamore/transforms/partition.py index 89164beab..4210cf39b 100644 --- a/lib/sycamore/sycamore/transforms/partition.py +++ b/lib/sycamore/sycamore/transforms/partition.py @@ -14,6 +14,7 @@ from sycamore.utils import choose_device from sycamore.utils.aryn_config import ArynConfig from sycamore.utils.bbox_sort import bbox_sort_document +from sycamore.utils.xycut import xycut_sort_document from sycamore.transforms.detr_partitioner_config import ( ARYN_DETR_MODEL, @@ -164,6 +165,7 @@ def __init__( min_partition_length: Optional[int] = 500, include_metadata: bool = True, retain_coordinates: bool = False, + sort_mode: Optional[str] = None, ): super().__init__(device="cpu") self._include_page_breaks = include_page_breaks @@ -174,6 +176,7 @@ def __init__( self._min_partition_length = min_partition_length self._include_metadata = include_metadata self._retain_coordinates = retain_coordinates + self._sort_mode = sort_mode @staticmethod def to_element(dict: dict[str, Any], element_index: Optional[int] = None, retain_coordinates=False) -> Element: @@ -233,7 +236,10 @@ def partition(self, document: Document) -> Document: ] del elements - bbox_sort_document(document) + if self._sort_mode and self._sort_mode == "xycut": + xycut_sort_document(document) + else: + bbox_sort_document(document) return document @@ -466,6 +472,7 @@ def __init__( text_extraction_options: dict[str, Any] = {}, source: str = "", output_label_options: dict[str, Any] = {}, + sort_mode: Optional[str] = None, **kwargs, ): if use_partitioning_service: @@ -508,6 +515,7 @@ def __init__( self._text_extraction_options = text_extraction_options self._source = source self.output_label_options = output_label_options + self.sort_mode = sort_mode self._kwargs = kwargs @timetrace("SycamorePdf") @@ -539,6 +547,7 @@ def partition(self, document: Document) -> Document: text_extraction_options=self._text_extraction_options, source=self._source, output_label_options=self.output_label_options, + sort_mode=self.sort_mode, **self._kwargs, ) except Exception as e: @@ -547,7 +556,10 @@ def partition(self, document: Document) -> Document: document.elements = elements - bbox_sort_document(document) + if self.sort_mode and self.sort_mode == "xycut": + xycut_sort_document(document) + else: + bbox_sort_document(document) return document diff --git a/lib/sycamore/sycamore/utils/markdown.py b/lib/sycamore/sycamore/utils/markdown.py index a1fe6c169..688fc571c 100644 --- a/lib/sycamore/sycamore/utils/markdown.py +++ b/lib/sycamore/sycamore/utils/markdown.py @@ -35,7 +35,7 @@ def escape_str(s: str) -> str: def elements_to_markdown(elems: list[Element], opts: Optional[dict[str, Any]] = None) -> str: """ This is the main function of interest. - Assumes elements are sorted as per bbox_sort.bbox_sort_document(). + Assumes elements are sorted as per bbox_sort or xycut. """ skip_types = {"image"} if not (opts and opts.get("include_headers")): From 3bcc5db5de918e2e559d1a93b9f3437dc2212d81 Mon Sep 17 00:00:00 2001 From: Alex Meyer Date: Fri, 16 May 2025 00:36:16 -0700 Subject: [PATCH 3/6] lint --- lib/sycamore/sycamore/tests/unit/transforms/test_partition.py | 3 +-- lib/sycamore/sycamore/utils/xycut.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/sycamore/sycamore/tests/unit/transforms/test_partition.py b/lib/sycamore/sycamore/tests/unit/transforms/test_partition.py index 27a843bd6..324daf912 100644 --- a/lib/sycamore/sycamore/tests/unit/transforms/test_partition.py +++ b/lib/sycamore/sycamore/tests/unit/transforms/test_partition.py @@ -4,7 +4,7 @@ import pytest from ray.data import Dataset -from sycamore.data import Document, Element +from sycamore.data import Document from sycamore.transforms.partition import ( Partition, HtmlPartitioner, @@ -12,7 +12,6 @@ UnstructuredPPTXPartitioner, SycamorePartitioner, ) -from sycamore.utils.xycut import xycut_sorted_elements from sycamore.connectors.file import BinaryScan from sycamore.tests.config import TEST_DIR diff --git a/lib/sycamore/sycamore/utils/xycut.py b/lib/sycamore/sycamore/utils/xycut.py index 4f260a29c..5e8e8c3bd 100644 --- a/lib/sycamore/sycamore/utils/xycut.py +++ b/lib/sycamore/sycamore/utils/xycut.py @@ -10,8 +10,8 @@ from sycamore.data import Document, Element from sycamore.utils.bbox_sort import collect_pages -type ElemList = list[Element] -type BeginEndList = list[tuple[float, int, Element]] +ElemList = list[Element] +BeginEndList = list[tuple[float, int, Element]] XAXIS = 0 YAXIS = 1 From 562dd5809864b439da65d45d06900e7094398fbc Mon Sep 17 00:00:00 2001 From: Alex Meyer Date: Fri, 16 May 2025 09:01:48 -0700 Subject: [PATCH 4/6] Doc sort_mode. --- lib/sycamore/sycamore/transforms/partition.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/sycamore/sycamore/transforms/partition.py b/lib/sycamore/sycamore/transforms/partition.py index 4210cf39b..862cdc683 100644 --- a/lib/sycamore/sycamore/transforms/partition.py +++ b/lib/sycamore/sycamore/transforms/partition.py @@ -435,6 +435,7 @@ class ArynPartitioner(Partitioner): Here is an example set of output label options: {"promote_title": True, "title_candidate_elements": ["Section-header", "Caption"]} default: None (no element is promoted to "Title") + sort_mode: Reading order algorithm: bbox (default) or xycut. kwargs: Additional keyword arguments to pass to the remote partitioner. Example: The following shows an example of using the ArynPartitioner to partition a PDF and extract From bad4cd673a0845f66f0ff2dd564630d00fe61e6b Mon Sep 17 00:00:00 2001 From: Alex Meyer Date: Mon, 19 May 2025 15:45:21 -0700 Subject: [PATCH 5/6] Fixed cleave bug; handle no-cut case; realistic test; better debug output --- .../tests/unit/utils/test_xycut_sort.py | 56 ++++++++++++++ lib/sycamore/sycamore/utils/xycut.py | 77 +++++++++++++------ 2 files changed, 111 insertions(+), 22 deletions(-) diff --git a/lib/sycamore/sycamore/tests/unit/utils/test_xycut_sort.py b/lib/sycamore/sycamore/tests/unit/utils/test_xycut_sort.py index fc0094064..eb5fdf955 100644 --- a/lib/sycamore/sycamore/tests/unit/utils/test_xycut_sort.py +++ b/lib/sycamore/sycamore/tests/unit/utils/test_xycut_sort.py @@ -89,6 +89,62 @@ def test_page_footer() -> None: assert elems == answer +# bbox coordinates and reading order from page 9 of +# https://www.aemps.gob.es/medicamentosUsoHumano/informesPublicos/docs/IPT-viekirax-exviera.pdf +g_viekirax_boxes = [ + (0.9159, 0.0231, 0.9825, 0.1116, 0), + (0.5336, 0.1245, 0.9489, 0.1612, 15), + (0.0951, 0.1245, 0.5205, 0.1486, 1), + (0.0945, 0.1524, 0.5202, 0.3006, 2), + (0.5339, 0.1686, 0.9478, 0.2051, 16), + (0.5340, 0.2126, 0.9529, 0.2492, 17), + (0.5335, 0.2565, 0.8968, 0.2808, 18), + (0.5571, 0.2820, 0.9482, 0.3055, 19), + (0.0945, 0.3046, 0.5198, 0.3655, 3), + (0.5336, 0.3129, 0.8991, 0.3371, 20), + (0.5572, 0.3384, 0.9484, 0.3619, 21), + (0.5332, 0.3689, 0.8977, 0.3932, 22), + (0.0945, 0.3693, 0.5180, 0.3938, 4), + (0.5574, 0.3943, 0.9482, 0.4182, 23), + (0.0947, 0.3974, 0.5187, 0.4221, 5), + (0.5325, 0.4255, 0.9324, 0.4497, 24), + (0.0950, 0.4258, 0.5195, 0.4499, 6), + (0.5324, 0.4575, 0.7744, 0.4693, 25), + (0.0946, 0.4709, 0.2071, 0.4842, 7), + (0.0948, 0.4911, 0.5152, 0.5287, 8), + (0.0945, 0.5355, 0.5029, 0.5725, 9), + (0.0946, 0.5793, 0.4987, 0.6289, 10), + (0.0939, 0.6359, 0.5168, 0.7223, 11), + (0.0948, 0.7290, 0.5058, 0.7785, 12), + (0.0947, 0.7851, 0.5160, 0.8472, 13), + (0.0946, 0.8544, 0.5148, 0.8913, 14), + (0.5568, 0.4700, 0.9550, 0.4941, 26), + (0.5334, 0.5016, 0.7799, 0.5134, 27), + (0.5556, 0.5140, 0.9545, 0.5384, 28), + (0.5324, 0.5452, 0.8140, 0.5575, 29), + (0.5566, 0.5581, 0.9507, 0.5820, 30), + (0.5323, 0.5894, 0.8261, 0.6018, 31), + (0.5564, 0.6020, 0.9550, 0.6265, 32), + (0.5330, 0.6336, 0.9540, 0.6951, 33), + (0.5323, 0.7021, 0.9566, 0.7641, 34), + (0.5323, 0.7706, 0.9559, 0.8326, 35), + (0.5321, 0.8392, 0.9508, 0.9140, 36), + (0.4780, 0.9469, 0.5713, 0.9590, 37), +] + + +def test_viekirax() -> None: + elems: list[Element] = [] + for tup in g_viekirax_boxes: + elem = mkElem(tup[0], tup[1], tup[2], tup[3]) + elem.text_representation = str(tup[4]) + elems.append(elem) + elems = xycut_sorted_page(elems) + for ii, elem in enumerate(elems): + s = str(ii) + assert elem.text_representation == s + + def assert_element_index_sorted(elements: list[Element]): assert all( elements[i].element_index < elements[i + 1].element_index for i in range(len(elements) - 1) # type: ignore diff --git a/lib/sycamore/sycamore/utils/xycut.py b/lib/sycamore/sycamore/utils/xycut.py index 5e8e8c3bd..7ddccfb82 100644 --- a/lib/sycamore/sycamore/utils/xycut.py +++ b/lib/sycamore/sycamore/utils/xycut.py @@ -5,7 +5,7 @@ """ from io import StringIO -from typing import Generator +from typing import Generator, Optional from sycamore.data import Document, Element from sycamore.utils.bbox_sort import collect_pages @@ -63,7 +63,7 @@ def append(self, node: NodeBase) -> "NodeInner": def to_str_impl(self, pfx: str, sio: StringIO) -> None: for idx, node in enumerate(self.nodes): - node.to_str_impl(f"{pfx}{idx:02d}.", sio) + node.to_str_impl(f"{pfx}{idx}.", sio) def to_text_impl(self, sio: StringIO) -> None: for node in self.nodes: @@ -111,11 +111,8 @@ def cansplit(self) -> bool: def to_str_impl(self, pfx: str, sio: StringIO) -> None: for idx, elist in enumerate(self.elists): for i, elem in enumerate(elist): - bbox = get_bbox(elem) - sio.write(f"{pfx}{i:02d}:{bbox}\n") - # if tr := elem.text_representation: - # nonl = tr.replace("\n", " ") - # sio.write(f"{pfx}{i:02d}:[{nonl}]\n") + s = elem_to_str(elem) + sio.write(f"{pfx}{i}: {s}\n") def to_text_impl(self, sio: StringIO) -> None: for elist in self.elists: @@ -132,12 +129,25 @@ def to_elems_impl(self, elems: ElemList) -> None: ############################################################################### -def get_bbox(elem: Element) -> tuple: +def get_bbox(elem: Element) -> tuple[float, float, float, float]: if bbox := elem.data.get("bbox"): return bbox return (1.0, 1.0, 1.0, 1.0) +def bbox_to_str(bbox: tuple[float, float, float, float]) -> str: + a, b, c, d = [int(1000 * x) for x in bbox] + return f"({a:03d},{b:03d})({c:03d},{d:03d})" + + +def elem_to_str(elem: Element) -> str: + s = bbox_to_str(get_bbox(elem)) + if tr := elem.text_representation: + nonl = tr.replace("\n", " ")[:32] + s += f"[{nonl}]" + return s + + def make_begin_end(elems: ElemList, axis: int) -> BeginEndList: """Returns array of (coord, isopen, elem)""" ary: BeginEndList = [] @@ -157,19 +167,24 @@ def gen_overlap(ary: BeginEndList) -> Generator[tuple, None, None]: """Yields tuple (coord, isopen, elem, count, width)""" count = 0 cur = ary[0] - for ii in range(len(ary) - 1): # end of ary will be a close - if cur[1]: + n = len(ary) + for ii in range(n): + width = -1.0 + if cur[1] == OPEN: count += 1 + next = ary[ii + 1] else: count -= 1 - next = ary[ii + 1] - width = 0.0 if count else next[0] - cur[0] + if (ii + 1) < n: + next = ary[ii + 1] + if count == 0: + width = next[0] - cur[0] yield (*cur, count, width) cur = next def widest_cut(order: BeginEndList) -> tuple[float, Element]: - rv = (0.0, order[0][2]) + rv = (-1.0, order[0][2]) if len(order) < 3: # order is twice as long as elems return rv for _, _, elem, count, width in gen_overlap(order): @@ -178,27 +193,39 @@ def widest_cut(order: BeginEndList) -> tuple[float, Element]: return rv -def choose_axis(elems: ElemList) -> tuple[BeginEndList, Element]: +def choose_axis(elems: ElemList) -> Optional[tuple[BeginEndList, Element]]: xorder = make_begin_end(elems, XAXIS) yorder = make_begin_end(elems, YAXIS) xw, xe = widest_cut(xorder) yw, ye = widest_cut(yorder) + if max(xw, yw) < 0.0: + return None if xw < yw: + # yval = get_bbox(ye)[3] + # s = elem_to_str(ye) + # print(f"Horiz cut y={yval:.3f} {s}") return (yorder, ye) - return (xorder, xe) + else: + # xval = get_bbox(xe)[2] + # s = elem_to_str(xe) + # print(f"Vert cut x={xval:.3f} {s}") + return (xorder, xe) def cleave_elems(elems: ElemList) -> NodeLeaf: """Binary split across widest gap.""" node = NodeLeaf() if len(elems) < 2: - return node.extend(elems).finalize() - order, cut_after = choose_axis(elems) - for _, isopen, elem, cnt, width in gen_overlap(order): - if isopen: - node.append(elem) - if elem == cut_after: - node.advance() + node.extend(elems) + elif (choice := choose_axis(elems)) is None: + node.extend(elems) + else: + order, cut_after = choice + for _, isopen, elem, cnt, width in gen_overlap(order): + if not isopen: + node.append(elem) + if elem == cut_after: + node.advance() return node.finalize() @@ -207,6 +234,9 @@ def divide_node(node: NodeLeaf) -> NodeInner: inner = NodeInner() for elist in node.elists: subnode = cleave_elems(elist) + # print("....................") + # print(subnode) + # print("^^^^^^^^^^^^^^^^^^^^") if subnode.cansplit(): inner.append(divide_node(subnode)) # recursive step else: @@ -218,6 +248,9 @@ def xycut_sorted_page(elems: ElemList) -> ElemList: if len(elems) < 2: return elems flat = NodeLeaf().extend(elems) + # print("VVVVVVVVVVVVVVVVVVVV") + # print(flat) + # print("AAAAAAAAAAAAAAAAAAAA") tree = divide_node(flat) return tree.to_elems() From fa5a3aac25d3fc46e1b52f399ef241188607d1ee Mon Sep 17 00:00:00 2001 From: Alex Meyer Date: Wed, 21 May 2025 16:36:28 -0700 Subject: [PATCH 6/6] Acted on PR comments. --- lib/sycamore/sycamore/data/element.py | 8 +- .../tests/unit/utils/test_bbox_sort.py | 14 +-- .../tests/unit/utils/test_xycut_sort.py | 28 +++-- .../sycamore/transforms/detr_partitioner.py | 8 +- .../sycamore/transforms/merge_elements.py | 8 +- lib/sycamore/sycamore/transforms/partition.py | 13 +-- lib/sycamore/sycamore/utils/bbox_sort.py | 40 +------ lib/sycamore/sycamore/utils/element_sort.py | 68 +++++++++++ lib/sycamore/sycamore/utils/markdown.py | 2 +- lib/sycamore/sycamore/utils/xycut.py | 106 +++++++++--------- 10 files changed, 154 insertions(+), 141 deletions(-) create mode 100644 lib/sycamore/sycamore/utils/element_sort.py diff --git a/lib/sycamore/sycamore/data/element.py b/lib/sycamore/sycamore/data/element.py index bfd7ca36c..2a17965cb 100644 --- a/lib/sycamore/sycamore/data/element.py +++ b/lib/sycamore/sycamore/data/element.py @@ -116,13 +116,13 @@ def field_to_value(self, field: str) -> Any: return dotted_lookup(self, field) def __lt__(self, other) -> bool: - sidx = self.element_index or 0 - oidx = other.element_index or 0 + sidx = -1 if self.element_index is None else self.element_index + oidx = -1 if other.element_index is None else other.element_index return sidx < oidx def __gt__(self, other) -> bool: - sidx = self.element_index or 0 - oidx = other.element_index or 0 + sidx = -1 if self.element_index is None else self.element_index + oidx = -1 if other.element_index is None else other.element_index return sidx > oidx diff --git a/lib/sycamore/sycamore/tests/unit/utils/test_bbox_sort.py b/lib/sycamore/sycamore/tests/unit/utils/test_bbox_sort.py index dedd11504..3fb608742 100644 --- a/lib/sycamore/sycamore/tests/unit/utils/test_bbox_sort.py +++ b/lib/sycamore/sycamore/tests/unit/utils/test_bbox_sort.py @@ -1,14 +1,8 @@ from typing import Any, Optional from sycamore.data import Document, Element -from sycamore.utils.bbox_sort import ( - collect_pages, - col_tag, - find_overlap, - bbox_sorted_elements, - bbox_sort_page, - bbox_sort_document, -) +from sycamore.utils.bbox_sort import col_tag, find_overlap, bbox_sort_page +from sycamore.utils.element_sort import collect_pages, sort_elements, sort_document def mkElem( @@ -99,7 +93,7 @@ def test_elements_basic() -> None: e9 = mkElem(0.20, 0.21, 0.90, 0.41, 2) elems = [e0, e1, e2, e3, e4, e5, e6, e7, e8, e9] - elems = bbox_sorted_elements(elems) + sort_elements(elems) answer = [e4, e5, e3, e6, e7, e8, e9, e1, e0, e2] assert elems == answer assert_element_index_sorted(elems) @@ -114,7 +108,7 @@ def test_document_basic() -> None: e5 = mkElem(0.1, 0.1, 0.9, 0.2, 2) doc = Document() doc.elements = [e0, e1, e2, e3, e4, e5] - bbox_sort_document(doc) + sort_document(doc, mode="bbox") answer = [e3, e2, e5, e4, e1, e0] assert doc.elements == answer assert_element_index_sorted(doc.elements) diff --git a/lib/sycamore/sycamore/tests/unit/utils/test_xycut_sort.py b/lib/sycamore/sycamore/tests/unit/utils/test_xycut_sort.py index eb5fdf955..5039d6730 100644 --- a/lib/sycamore/sycamore/tests/unit/utils/test_xycut_sort.py +++ b/lib/sycamore/sycamore/tests/unit/utils/test_xycut_sort.py @@ -1,11 +1,8 @@ from typing import Any, Optional from sycamore.data import Document, Element -from sycamore.utils.xycut import ( - xycut_sorted_elements, - xycut_sorted_page, - xycut_sort_document, -) +from sycamore.utils.xycut import xycut_sort_page +from sycamore.utils.element_sort import sort_elements, sort_document def mkElem( @@ -27,7 +24,7 @@ def test_page_basic() -> None: e3 = mkElem(0.60, 0.65, 0.90, 0.85) e4 = mkElem(0.15, 0.10, 0.85, 0.15) elems = [e0, e1, e2, e3, e4] - elems = xycut_sorted_page(elems) + xycut_sort_page(elems) answer = [e4, e1, e2, e0, e3] assert elems == answer @@ -51,7 +48,7 @@ def test_elements_basic() -> None: e9 = mkElem(0.20, 0.21, 0.90, 0.41, 2) elems = [e0, e1, e2, e3, e4, e5, e6, e7, e8, e9] - elems = xycut_sorted_elements(elems) + sort_elements(elems, mode="xycut") answer = [e4, e5, e3, e6, e7, e8, e9, e1, e0, e2] assert elems == answer assert_element_index_sorted(elems) @@ -66,7 +63,7 @@ def test_document_basic() -> None: e5 = mkElem(0.1, 0.1, 0.9, 0.2, 2) doc = Document() doc.elements = [e0, e1, e2, e3, e4, e5] - xycut_sort_document(doc) + sort_document(doc, mode=xycut_sort_page) answer = [e3, e2, e5, e4, e1, e0] assert doc.elements == answer assert_element_index_sorted(doc.elements) @@ -84,11 +81,22 @@ def test_page_footer() -> None: e4 = mkElem(0.15, 0.10, 0.85, 0.15) e5 = mkElem(0.25, 0.95, 0.75, 1.0, type="Page-footer") elems = [e0, e1, e2, e3, e4, e5] - elems = xycut_sorted_page(elems) + xycut_sort_page(elems) answer = [e4, e1, e2, e0, e3, e5] assert elems == answer +def test_no_cut() -> None: + e0 = mkElem(0.40, 0.70, 0.90, 0.90) + e1 = mkElem(0.10, 0.40, 0.30, 0.90) + e2 = mkElem(0.70, 0.10, 0.90, 0.60) + e3 = mkElem(0.10, 0.10, 0.60, 0.30) + elems = [e0, e1, e2, e3] + xycut_sort_page(elems) + answer = [e3, e1, e0, e2] # what bbox_sort gives + assert elems == answer + + # bbox coordinates and reading order from page 9 of # https://www.aemps.gob.es/medicamentosUsoHumano/informesPublicos/docs/IPT-viekirax-exviera.pdf g_viekirax_boxes = [ @@ -139,7 +147,7 @@ def test_viekirax() -> None: elem = mkElem(tup[0], tup[1], tup[2], tup[3]) elem.text_representation = str(tup[4]) elems.append(elem) - elems = xycut_sorted_page(elems) + xycut_sort_page(elems) for ii, elem in enumerate(elems): s = str(ii) assert elem.text_representation == s diff --git a/lib/sycamore/sycamore/transforms/detr_partitioner.py b/lib/sycamore/sycamore/transforms/detr_partitioner.py index 81241ed54..eb574ed3d 100644 --- a/lib/sycamore/sycamore/transforms/detr_partitioner.py +++ b/lib/sycamore/sycamore/transforms/detr_partitioner.py @@ -19,8 +19,7 @@ from sycamore.data.element import create_element from sycamore.transforms.table_structure.extract import DEFAULT_TABLE_STRUCTURE_EXTRACTOR from sycamore.utils import choose_device -from sycamore.utils.bbox_sort import bbox_sort_page -from sycamore.utils.xycut import xycut_sorted_page +from sycamore.utils.element_sort import sort_page from sycamore.utils.cache import Cache from sycamore.utils.image_utils import crop_to_bbox, image_to_bytes from sycamore.utils.import_utils import requires_modules @@ -225,10 +224,7 @@ def partition_pdf( promote_title(page, title_candidate_elements) else: promote_title(page) - if sort_mode and sort_mode == "xycut": - page = xycut_sorted_page(page) - else: - bbox_sort_page(page) + sort_page(page, mode=sort_mode) elements.extend(page) if output_format == "markdown": md = elements_to_markdown(elements) diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py index 830c11fb7..d1239b91a 100644 --- a/lib/sycamore/sycamore/transforms/merge_elements.py +++ b/lib/sycamore/sycamore/transforms/merge_elements.py @@ -13,8 +13,7 @@ from sycamore.utils.merge_utils import combine_strs_min_newline from sycamore.transforms.llm_query import LLMTextQueryAgent from sycamore.llms import LLM -from sycamore.utils.bbox_sort import bbox_sort_document -from sycamore.utils.xycut import xycut_sort_document +from sycamore.utils.element_sort import sort_document class ElementMerger(ABC): @@ -495,10 +494,7 @@ def merge_elements(self, document: Document) -> Document: new_table_elements[-1]["properties"]["table_continuation"] = False other_elements.extend(new_table_elements) document.elements = other_elements - if self.sort_mode and self.sort_mode == "xycut": - xycut_sort_document(document) - else: - bbox_sort_document(document) + sort_document(document, mode=self.sort_mode) return document diff --git a/lib/sycamore/sycamore/transforms/partition.py b/lib/sycamore/sycamore/transforms/partition.py index 862cdc683..03fffdac8 100644 --- a/lib/sycamore/sycamore/transforms/partition.py +++ b/lib/sycamore/sycamore/transforms/partition.py @@ -13,8 +13,7 @@ from sycamore.utils.time_trace import timetrace from sycamore.utils import choose_device from sycamore.utils.aryn_config import ArynConfig -from sycamore.utils.bbox_sort import bbox_sort_document -from sycamore.utils.xycut import xycut_sort_document +from sycamore.utils.element_sort import sort_document from sycamore.transforms.detr_partitioner_config import ( ARYN_DETR_MODEL, @@ -236,10 +235,7 @@ def partition(self, document: Document) -> Document: ] del elements - if self._sort_mode and self._sort_mode == "xycut": - xycut_sort_document(document) - else: - bbox_sort_document(document) + sort_document(document, mode=self._sort_mode) return document @@ -557,10 +553,7 @@ def partition(self, document: Document) -> Document: document.elements = elements - if self.sort_mode and self.sort_mode == "xycut": - xycut_sort_document(document) - else: - bbox_sort_document(document) + sort_document(document, mode=self.sort_mode) return document diff --git a/lib/sycamore/sycamore/utils/bbox_sort.py b/lib/sycamore/sycamore/utils/bbox_sort.py index f94e1c35f..a244870f8 100644 --- a/lib/sycamore/sycamore/utils/bbox_sort.py +++ b/lib/sycamore/sycamore/utils/bbox_sort.py @@ -8,8 +8,7 @@ from typing import Optional -from sycamore.data import Document, Element -from sycamore.data.document import DocumentPropertyTypes +from sycamore.data import Element def elem_top_left(elem: Element) -> tuple: @@ -27,28 +26,6 @@ def elem_left_top(elem: Element) -> tuple: return (0.0, 0.0) -def collect_pages(elems: list[Element]) -> list[list[Element]]: - """ - Collect elements into page-number buckets. Basically like the first - stage of a radix sort. - """ - pagemap: dict[int, list[Element]] = {} - for elem in elems: - page = elem.properties.get(DocumentPropertyTypes.PAGE_NUMBER, 0) - ary = pagemap.get(page) - if ary: - ary.append(elem) - else: - pagemap[page] = [elem] - nums = list(pagemap.keys()) - nums.sort() - rv = [] - for page in nums: - ary = pagemap[page] - rv.append(ary) - return rv - - def col_tag(elem: Element) -> Optional[str]: bbox = elem.data.get("bbox") if bbox: @@ -141,18 +118,3 @@ def bbox_sort_page(elems: list[Element]) -> None: bbox_sort_based_on_tags(elems) for elem in elems: elem.data.pop("_coltag", None) # clean up tags - - -def bbox_sorted_elements(elements: list[Element], update_element_indexs: bool = True) -> list[Element]: - pages = collect_pages(elements) - for elems in pages: - bbox_sort_page(elems) - ordered_elements = [elem for elems in pages for elem in elems] # flatten - if update_element_indexs: - for idx, element in enumerate(ordered_elements): - element.element_index = idx - return ordered_elements - - -def bbox_sort_document(doc: Document, update_element_indexs: bool = True) -> None: - doc.elements = bbox_sorted_elements(doc.elements, update_element_indexs) diff --git a/lib/sycamore/sycamore/utils/element_sort.py b/lib/sycamore/sycamore/utils/element_sort.py new file mode 100644 index 000000000..c3d990934 --- /dev/null +++ b/lib/sycamore/sycamore/utils/element_sort.py @@ -0,0 +1,68 @@ +from typing import Optional, Union +from collections.abc import Callable + +from sycamore.data import Document, Element +from sycamore.data.document import DocumentPropertyTypes +from sycamore.utils.bbox_sort import bbox_sort_page +from sycamore.utils.xycut import xycut_sort_page + + +def nop_page(p: list[Element]) -> None: + pass + + +PAGE_SORT = { + "bbox": bbox_sort_page, + "xycut": xycut_sort_page, + "nop": nop_page, + None: bbox_sort_page, +} + + +def collect_pages(elems: list[Element]) -> list[list[Element]]: + """ + Collect elements into page-number buckets. Basically like the first + stage of a radix sort. + """ + pagemap: dict[int, list[Element]] = {} + for elem in elems: + page = elem.properties.get(DocumentPropertyTypes.PAGE_NUMBER, 0) + ary = pagemap.get(page) + if ary: + ary.append(elem) + else: + pagemap[page] = [elem] + nums = list(pagemap.keys()) + nums.sort() + rv = [] + for page in nums: + ary = pagemap[page] + rv.append(ary) + return rv + + +# This allows specification of the "page sort" building block either +# as a string ("bbox", "xycut") or Callable. +SortSpec = Union[Optional[str], Callable[[list[Element]], None]] + + +def sort_page(elems: list[Element], *, mode: SortSpec = None) -> None: + if callable(mode): + mode(elems) + else: + func = PAGE_SORT[mode] + func(elems) + + +def sort_elements(elements: list[Element], *, mode: SortSpec = None) -> None: + pages = collect_pages(elements) + for page in pages: + sort_page(page, mode=mode) + ordered = [elem for elems in pages for elem in elems] # flatten + for idx, elem in enumerate(ordered): + elem.element_index = idx + elements[:] = ordered # replace contents + + +def sort_document(doc: Document, *, mode: SortSpec = None) -> None: + sort_elements(doc.elements, mode=mode) diff --git a/lib/sycamore/sycamore/utils/markdown.py b/lib/sycamore/sycamore/utils/markdown.py index 688fc571c..72ea60dbe 100644 --- a/lib/sycamore/sycamore/utils/markdown.py +++ b/lib/sycamore/sycamore/utils/markdown.py @@ -35,7 +35,7 @@ def escape_str(s: str) -> str: def elements_to_markdown(elems: list[Element], opts: Optional[dict[str, Any]] = None) -> str: """ This is the main function of interest. - Assumes elements are sorted as per bbox_sort or xycut. + Assumes elements are in reading order (see element_sort.py) """ skip_types = {"image"} if not (opts and opts.get("include_headers")): diff --git a/lib/sycamore/sycamore/utils/xycut.py b/lib/sycamore/sycamore/utils/xycut.py index 7ddccfb82..93b737787 100644 --- a/lib/sycamore/sycamore/utils/xycut.py +++ b/lib/sycamore/sycamore/utils/xycut.py @@ -1,39 +1,44 @@ """ Sort Elements into reading order based on bboxes using X-Y Cut. -Most useful functions are at the bottom of the file. +Best to access this through element_sort.py """ from io import StringIO from typing import Generator, Optional -from sycamore.data import Document, Element -from sycamore.utils.bbox_sort import collect_pages +from sycamore.data import Element +from sycamore.utils.bbox_sort import bbox_sort_page # for fallback ElemList = list[Element] BeginEndList = list[tuple[float, int, Element]] -XAXIS = 0 -YAXIS = 1 +XAXIS = 0 # bbox[0] and bbox[2] are x +YAXIS = 1 # bbox[1] and bbox[3] are y OPEN = 1 CLOSE = 0 # sorts first +DEBUG = False + class NodeBase: """Like a B-tree of Elements.""" def __str__(self) -> str: + """Returns a string with hierarchy, bboxes and text snippets.""" sio = StringIO() self.to_str_impl("", sio) return sio.getvalue().rstrip() def to_text(self) -> str: + """Serializes the text in order.""" sio = StringIO() self.to_text_impl(sio) return sio.getvalue().rstrip() def to_elems(self) -> ElemList: + """Returns list of elements in order.""" elems: ElemList = [] self.to_elems_impl(elems) return elems @@ -103,7 +108,7 @@ def finalize(self) -> "NodeLeaf": def cansplit(self) -> bool: try: elem = self.elists[0][0] - next = self.elists[1] # noqa: F841 + next = self.elists[1] # make sure this exists # noqa: F841 return isinstance(elem, Element) except (IndexError, TypeError): return False @@ -112,7 +117,7 @@ def to_str_impl(self, pfx: str, sio: StringIO) -> None: for idx, elist in enumerate(self.elists): for i, elem in enumerate(elist): s = elem_to_str(elem) - sio.write(f"{pfx}{i}: {s}\n") + sio.write(f"{pfx}{idx},{i}: {s}\n") def to_text_impl(self, sio: StringIO) -> None: for elist in self.elists: @@ -132,7 +137,7 @@ def to_elems_impl(self, elems: ElemList) -> None: def get_bbox(elem: Element) -> tuple[float, float, float, float]: if bbox := elem.data.get("bbox"): return bbox - return (1.0, 1.0, 1.0, 1.0) + return (1.0, 1.0, 1.0, 1.0) # max values sort at end def bbox_to_str(bbox: tuple[float, float, float, float]) -> str: @@ -150,42 +155,38 @@ def elem_to_str(elem: Element) -> str: def make_begin_end(elems: ElemList, axis: int) -> BeginEndList: """Returns array of (coord, isopen, elem)""" - ary: BeginEndList = [] + bel: BeginEndList = [] for elem in elems: bbox = get_bbox(elem) aa = bbox[axis] bb = bbox[axis + 2] if bb < aa: aa, bb = bb, aa - ary.append((aa, OPEN, elem)) - ary.append((bb, CLOSE, elem)) - ary.sort() - return ary + bel.append((aa, OPEN, elem)) + bel.append((bb, CLOSE, elem)) + bel.sort() + return bel -def gen_overlap(ary: BeginEndList) -> Generator[tuple, None, None]: +def gen_overlap(bel: BeginEndList) -> Generator[tuple, None, None]: """Yields tuple (coord, isopen, elem, count, width)""" count = 0 - cur = ary[0] - n = len(ary) - for ii in range(n): + cur = bel[0] + n = len(bel) + for ii, cur in enumerate(bel): width = -1.0 if cur[1] == OPEN: count += 1 - next = ary[ii + 1] else: count -= 1 - if (ii + 1) < n: - next = ary[ii + 1] - if count == 0: - width = next[0] - cur[0] + if (count == 0) and ((ii + 1) < n): + width = bel[ii + 1][0] - cur[0] yield (*cur, count, width) - cur = next def widest_cut(order: BeginEndList) -> tuple[float, Element]: rv = (-1.0, order[0][2]) - if len(order) < 3: # order is twice as long as elems + if len(order) <= 2: # two means one element; no cut exists return rv for _, _, elem, count, width in gen_overlap(order): if count == 0: @@ -194,6 +195,7 @@ def widest_cut(order: BeginEndList) -> tuple[float, Element]: def choose_axis(elems: ElemList) -> Optional[tuple[BeginEndList, Element]]: + """Note that coordinate system is square, but pages are not.""" xorder = make_begin_end(elems, XAXIS) yorder = make_begin_end(elems, YAXIS) xw, xe = widest_cut(xorder) @@ -201,14 +203,16 @@ def choose_axis(elems: ElemList) -> Optional[tuple[BeginEndList, Element]]: if max(xw, yw) < 0.0: return None if xw < yw: - # yval = get_bbox(ye)[3] - # s = elem_to_str(ye) - # print(f"Horiz cut y={yval:.3f} {s}") + if DEBUG: + yval = get_bbox(ye)[3] + s = elem_to_str(ye) + print(f"Horiz cut y={yval:.3f} {s}") return (yorder, ye) else: - # xval = get_bbox(xe)[2] - # s = elem_to_str(xe) - # print(f"Vert cut x={xval:.3f} {s}") + if DEBUG: + xval = get_bbox(xe)[2] + s = elem_to_str(xe) + print(f"Vert cut x={xval:.3f} {s}") return (xorder, xe) @@ -234,9 +238,10 @@ def divide_node(node: NodeLeaf) -> NodeInner: inner = NodeInner() for elist in node.elists: subnode = cleave_elems(elist) - # print("....................") - # print(subnode) - # print("^^^^^^^^^^^^^^^^^^^^") + if DEBUG: + print("....................") + print(subnode) + print("^^^^^^^^^^^^^^^^^^^^") if subnode.cansplit(): inner.append(divide_node(subnode)) # recursive step else: @@ -244,28 +249,19 @@ def divide_node(node: NodeLeaf) -> NodeInner: return inner -def xycut_sorted_page(elems: ElemList) -> ElemList: +def xycut_sort_page(elems: ElemList) -> None: if len(elems) < 2: - return elems + return flat = NodeLeaf().extend(elems) - # print("VVVVVVVVVVVVVVVVVVVV") - # print(flat) - # print("AAAAAAAAAAAAAAAAAAAA") + if DEBUG: + print("VVVVVVVVVVVVVVVVVVVV") + print(flat) + print("AAAAAAAAAAAAAAAAAAAA") tree = divide_node(flat) - return tree.to_elems() - - -def xycut_sorted_elements(elements: ElemList, update_indices: bool = True) -> ElemList: - flat: ElemList = [] - pages = collect_pages(elements) - for page in pages: - elems = xycut_sorted_page(page) - flat.extend(elems) - if update_indices: - for idx, elem in enumerate(flat): - elem.element_index = idx - return flat - - -def xycut_sort_document(doc: Document, update_indices: bool = True) -> None: - doc.elements = xycut_sorted_elements(doc.elements, update_indices) + if (len(tree.nodes) == 1) and isinstance(tree.nodes[0], NodeLeaf) and (len(tree.nodes[0].elists) == 1): + # If we didn't make any cuts, fall back to old algorithm + if DEBUG: + print("Falling back to bbox_sort") + bbox_sort_page(elems) + return + elems[:] = tree.to_elems() # replace contents of list