+
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion lib/sycamore/sycamore/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from sycamore.data.bbox import BoundingBox
from sycamore.data.table import Table
from sycamore.data.table import Table, TableCell
from sycamore.data.element import Element, ImageElement, TableElement
from sycamore.data.document import (
Document,
Expand All @@ -21,4 +21,5 @@
"OpenSearchQuery",
"OpenSearchQueryResult",
"Table",
"TableCell",
]
26 changes: 22 additions & 4 deletions lib/sycamore/sycamore/transforms/llm_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(
llm_kwargs: dict = {},
per_element: bool = True,
element_type: Optional[str] = None,
table_cont: Optional[bool] = False,
):
self._llm = llm
self._prompt = prompt
Expand All @@ -57,16 +58,25 @@ def __init__(
self._format_kwargs = format_kwargs
self._number_of_elements = number_of_elements
self._element_type = element_type
self._table_cont = table_cont

def execute_query(self, document: Document) -> Document:
final_prompt = self._prompt
element_count = 0
prev_table = -1
if self._per_element or self._number_of_elements:
for idx, element in enumerate(document.elements):
if self._element_type and element.type != self._element_type:
continue
if self._per_element:
document.elements[idx] = self._query_text_object(element)
if not self._table_cont:
document.elements[idx] = self._query_text_object(element)
else:
if prev_table >= 0:
document.elements[idx] = self._query_text_object(element, document.elements[prev_table])
else:
document.elements[idx] = self._query_text_object(element)
prev_table = idx
else:
final_prompt += "\n" + element["text_representation"]
if self._number_of_elements:
Expand All @@ -83,7 +93,9 @@ def execute_query(self, document: Document) -> Document:
return document

@timetrace("LLMQueryText")
def _query_text_object(self, object: Union[Document, Element]) -> Union[Document, Element]:
def _query_text_object(
self, object: Union[Document, Element], objectPrev: Optional[Element] = None
) -> Union[Document, Element]:
if object.text_representation:
if self._format_kwargs:
prompt = (
Expand All @@ -92,10 +104,16 @@ def _query_text_object(self, object: Union[Document, Element]) -> Union[Document
.render(doc=object)
)
else:
prompt = self._prompt + "\n" + object.text_representation
if objectPrev and objectPrev.text_representation:
prompt = self._prompt + "\n" + objectPrev.text_representation + "\n\n" + object.text_representation
else:
prompt = self._prompt + "\n" + object.text_representation
prompt_kwargs = {"prompt": prompt}
llm_resp = self._llm.generate(prompt_kwargs=prompt_kwargs, llm_kwargs=self._llm_kwargs)
object["properties"][self._output_property] = llm_resp
if self._table_cont:
object["properties"]["table_continuation"] = llm_resp
else:
object["properties"][self._output_property] = llm_resp
return object


Expand Down
203 changes: 201 additions & 2 deletions lib/sycamore/sycamore/transforms/merge_elements.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
from abc import ABC, abstractmethod
from typing import Any, Dict
from typing import Any, Dict, Pattern, Optional
from collections import defaultdict
import re


from sycamore.data import Document, Element, BoundingBox
from sycamore.data import Document, Element, BoundingBox, Table, TableElement, TableCell
from sycamore.data.document import DocumentPropertyTypes
from sycamore.plan_nodes import SingleThreadUser, NonGPUUser, Node
from sycamore.functions.tokenizer import Tokenizer
from sycamore.transforms.map import Map
from sycamore.utils.time_trace import timetrace
from sycamore.transforms.llm_query import LLMTextQueryAgent
from sycamore.llms import LLM
from sycamore.utils.bbox_sort import bbox_sort_document


class ElementMerger(ABC):
Expand Down Expand Up @@ -412,6 +417,200 @@ def merge_elements(self, document: Document) -> Document:
return document


class TableMerger(ElementMerger):
    r"""
    The ``TableMerger`` handles 3 operations:

    1. If a text element (Text, Title, Page-header, Section-header, Caption) on a page
       matches the regex pattern, its text is prepended to the text_representation of
       the tables on that page and added to their title. When several elements on
       the same page match, the last one wins.
    2. LLMQuery is used to add a ``table_continuation`` property to table elements.
       If the table is a continuation of the previous table, the LLM is expected to
       answer with a response containing "true"; otherwise "false".
    3. After LLMQuery, table elements which are continuations are merged into one element.

    ``regex_pattern`` is handed to ``re.search``, so either a pattern string or a
    compiled pattern works.

    Example:
        .. code-block:: python

            llm = OpenAI(OpenAIModels.GPT_4O, api_key = '')

            prompt = "Analyze two CSV tables that may be parts of a single table split across pages. Determine\
            if the second table is a continuation of the first with 100% certainty. Check either of the following:\
            1. Column headers: Must be near identical in terms of text(the ordering/text may contain minor errors \
            because of OCR quality) in both tables. If the headers are almost the same check the number of columns,\
            they should be roughly the same. \
            2. Missing headers: If the header/columns in the second table are missing, then the first row in the \
            second table should logically be in continuation of the last row in the first table.\
            Respond with only 'true' or 'false' based on your certainty that the second table is a continuation. \
            Certainty is determined if either of the two conditions is true."

            regex_pattern = r"table \d+"

            merger = TableMerger(regex_pattern=regex_pattern, llm_prompt=prompt, llm=llm)

            context = sycamore.init()
            pdf_docset = (
                context.read.binary(paths, binary_format="pdf")
                .partition(partitioner=ArynPartitioner())
                .merge(merger=merger)
            )
    """

    def __init__(
        self,
        regex_pattern: Optional[Pattern] = None,
        llm_prompt: Optional[str] = None,
        llm: Optional[LLM] = None,
        *args,
        **kwargs,
    ):
        """
        Args:
            regex_pattern: Pattern identifying header/caption text to attach to tables.
                When falsy, the header-attachment step is skipped.
            llm_prompt: Prompt used to decide whether a table continues the previous one.
                When falsy, no LLM query and no merging takes place.
            llm: The LLM used to answer ``llm_prompt``.
            *args, **kwargs: Accepted for call-site compatibility; currently unused.
        """
        self.regex_pattern = regex_pattern
        self.llm_prompt = llm_prompt
        self.llm = llm

    def merge_elements(self, document: Document) -> Document:
        """
        Attach matching headers to tables, then merge tables flagged as continuations.

        Returns the document unchanged when it has no table elements. Merging only
        runs when an ``llm_prompt`` is configured and at least two tables exist.
        """
        table_elements = [ele for ele in document.elements if ele.type == "table"]
        if not table_elements:
            return document
        if self.regex_pattern:
            document.elements = self.customTableHeaderAdditionFilter(document.elements)
        if not self.llm_prompt or len(table_elements) < 2:
            return document

        document = self.process_llm_query(document)
        table_elements = [ele for ele in document.elements if ele.type == "table"]
        other_elements = [ele for ele in document.elements if ele.type != "table"]

        # Fold every continuation into the most recently kept table.
        new_table_elements = [table_elements[0]]
        for element in table_elements[1:]:
            if self.should_merge(new_table_elements[-1], element):
                new_table_elements[-1] = self.merge(new_table_elements[-1], element)
            else:
                new_table_elements.append(element)

        # Tables were pulled out of document order above; bbox_sort_document is
        # applied to the recombined list before returning.
        other_elements.extend(new_table_elements)
        document.elements = other_elements
        bbox_sort_document(document)

        return document

    def should_merge(self, element1: Element, element2: Element) -> bool:
        """
        Return True when ``element2`` carries a ``table_continuation`` property whose
        text contains "true" (case-insensitive). ``element1`` is not inspected.
        """
        answer = element2["properties"].get("table_continuation")
        # A missing or non-text answer is treated as "not a continuation".
        return isinstance(answer, str) and "true" in answer.lower()

    def merge(self, elt1: Element, elt2: Element) -> Element:
        """
        Build a new TableElement holding the rows of ``elt1`` followed by those of ``elt2``.

        Raises:
            TypeError: if either element is not a TableElement.
            ValueError: if either element has no table.
        """
        # Check if both elements are TableElements
        if not isinstance(elt1, TableElement) or not isinstance(elt2, TableElement):
            raise TypeError("Both elements must be of type TableElement to perform merging.")
        # Combine the cells, adjusting the row indices for the second table
        if elt1.table is None or elt2.table is None:
            raise ValueError("Both elements must have a table to perform merging.")

        offset_row = elt1.table.num_rows
        merged_cells = elt1.table.cells + [
            TableCell(
                content=cell.content,
                rows=[r + offset_row for r in cell.rows],
                cols=cell.cols,
                is_header=cell.is_header,
                bbox=cell.bbox,
                properties=cell.properties,
            )
            for cell in elt2.table.cells
        ]

        # Create a new Table object with merged cells
        merged_table = Table(cells=merged_cells)

        # Join only the titles that are present. str.strip(" / ") would strip a
        # character set (spaces and slashes), not the separator, and so could eat
        # legitimate leading/trailing slashes of a title.
        titles = [
            str(title).strip()
            for title in (elt1.data["properties"].get("title"), elt2.data["properties"].get("title"))
            if title
        ]
        merged_title = " / ".join(title for title in titles if title)

        # Create a new TableElement with the merged table and combined metadata
        new_elt = TableElement(
            title=merged_title if merged_title else None,
            columns=elt1.columns if elt1.columns else elt2.columns,
            rows=elt1.rows + elt2.rows if elt1.rows and elt2.rows else None,
            table=merged_table,
            tokens=elt1.tokens + elt2.tokens if elt1.tokens and elt2.tokens else None,
        )

        # Merge binary representations by concatenation
        if elt1.binary_representation is None or elt2.binary_representation is None:
            new_elt.binary_representation = elt1.binary_representation or elt2.binary_representation
        else:
            new_elt.binary_representation = elt1.binary_representation + elt2.binary_representation
        # Merge text representations by concatenation with a newline
        if elt1.text_representation is None or elt2.text_representation is None:
            new_elt.text_representation = elt1.text_representation or elt2.text_representation
        else:
            new_elt.text_representation = elt1.text_representation + "\n" + elt2.text_representation

        # Merge properties by taking the union of the keys. Precedence is: values
        # already present on the new element, then elt1, then elt2.
        # NOTE(review): keeping already-present values assumes the TableElement
        # constructor stores title/columns/rows in ``properties`` (the title is read
        # from there above) -- confirm. If it does not, this is identical to copying
        # elt1's properties unconditionally.
        properties = new_elt.properties
        for source in (elt1, elt2):
            for key, value in source.properties.items():
                if key == "page_numbers":
                    # Union page lists from both sides instead of letting one replace the other.
                    if isinstance(value, (list, tuple)):
                        self._add_page_numbers(properties, value)
                    continue
                if properties.get(key) is None:
                    properties[key] = value
                # if a page number exists, add it to the page numbers for this new element
                if key == DocumentPropertyTypes.PAGE_NUMBER:
                    self._add_page_numbers(properties, [value])

        # TO-DO: Currently bbox points to first table bbox, and other bboxs are removed in
        # this process, potential fix can be to have a list of bboxs, and change label
        # of bbox after first as "table_continuation"
        if elt1.bbox is None or elt2.bbox is None:
            new_elt.bbox = elt1.bbox or elt2.bbox
        else:
            new_elt.bbox = BoundingBox(
                elt1.bbox.x1,
                elt1.bbox.y1,
                elt1.bbox.x2,
                elt1.bbox.y2,
            )
        new_elt.properties = properties

        return new_elt

    @staticmethod
    def _add_page_numbers(properties, pages):
        """Extend ``properties["page_numbers"]`` with ``pages``, dropping duplicates but keeping order."""
        existing = properties.get("page_numbers") or []
        properties["page_numbers"] = list(dict.fromkeys([*existing, *pages]))

    def customTableHeaderAdditionFilter(self, elements):
        """
        Prepend the page's matching header text to each table on that page.

        The header is also appended to the table's title (or becomes the title when
        there is none). Tables on pages without a matching header are left untouched.
        Returns the same ``elements`` sequence, mutated in place.
        """
        header_types = ("Text", "Title", "Page-header", "Section-header", "Caption")
        headers_by_page = {}

        # First pass: capture headers
        for ele in elements:
            if ele.type not in header_types:
                continue
            text_rep = (ele.text_representation or "").strip()
            if not text_rep or not re.search(self.regex_pattern, text_rep):
                continue
            page = ele["properties"].get("page_number")
            if page is not None:
                # Later matches on the same page replace earlier ones.
                headers_by_page[page] = text_rep + " "

        # Second pass: update table elements with headers, done in separate loops since
        # table headers can be within table elements as well or after them
        for ele in elements:
            if ele.type != "table" or not isinstance(ele["table"], Table):
                continue
            header = headers_by_page.get(ele["properties"].get("page_number"))
            if not header:
                # Nothing matched on this page: do not touch text or title.
                continue
            ele.text_representation = header + (ele.text_representation or "")
            title = ele["properties"].get("title")
            ele["properties"]["title"] = title + "\n" + header if title else header
        return elements

    def process_llm_query(self, document):
        """Run the continuation prompt over the document's table elements and return the document."""
        # TO-DO: Add async llm query
        llm_query_agent = LLMTextQueryAgent(prompt=self.llm_prompt, element_type="table", llm=self.llm, table_cont=True)
        llm_results = llm_query_agent.execute_query(document)
        return llm_results

    def preprocess_element(self, elem: Element) -> Element:
        """No per-element preprocessing is needed; the element is returned as is."""
        return elem

    def postprocess_element(self, elem: Element) -> Element:
        """No per-element postprocessing is needed; the element is returned as is."""
        return elem


class Merge(SingleThreadUser, NonGPUUser, Map):
"""
Merge Elements into fewer large elements
Expand Down
Loading
点击 这是indexloc提供的php浏览器服务,不要输入任何密码和下载