aryn-ai · dhruvkaliraman7 · Oct 31, 2024 · Oct 31, 2024 · Oct 31, 2024 · Oct 31, 2024
diff --git a/lib/sycamore/sycamore/data/table.py b/lib/sycamore/sycamore/data/table.py
@@ -1,6 +1,6 @@
 from collections import OrderedDict
 from dataclasses import dataclass, field
-from typing import Any, Optional, TypeVar, Union
+from typing import Any, Optional, TypeVar, Union, List
 import xml.etree.ElementTree as ET
 
 from bs4 import BeautifulSoup, Tag
@@ -99,7 +99,9 @@ class Table:
     are lossy, since, for instance, CSV does not natively support spanning cells.
     """
 
-    def __init__(self, cells: list[TableCell], caption: Optional[str] = None):
+    def __init__(
+        self, cells: list[TableCell], caption: Optional[str] = None, column_headers: Optional[list[str]] = None
+    ):
         """Creates a new Table.
 
         Args:
@@ -112,6 +114,11 @@ def __init__(self, cells: list[TableCell], caption: Optional[str] = None):
         self.num_rows = max(max(c.rows) for c in self.cells) + 1
         self.num_cols = max(max(c.cols) for c in self.cells) + 1
 
+        if column_headers is not None:
+            self.column_headers = column_headers
+        else:
+            self.column_headers = self.to_pandas(column_header_only=True)
+
     def __eq__(self, other):
         if type(other) is not type(self):
             return False
@@ -248,7 +255,7 @@ def from_html(cls, html_str: Optional[str] = None, html_tag: Optional[Tag] = Non
     # we speculate that duplication may create confusion, so we default to only displaying a cells
     # content for the first row/column for which it is applicable. The exception is for header rows,
     # where we duplicate values to each columnn to ensure that every column has a fully qualified header.
-    def to_pandas(self) -> DataFrame:
+    def to_pandas(self, column_header_only: bool = False) -> Union[DataFrame, List[str]]:
         """Returns this table as a Pandas DataFrame.
 
         For example, Suppose a cell spans row 2-3 and columns 4-5.
@@ -286,20 +293,22 @@ def to_pandas(self) -> DataFrame:
                             table_array[row, col] = ""
 
                 else:
-                    for row in cell.rows:
-                        for col in cell.cols:
-                            if row == cell.rows[0] and col == cell.cols[0]:
-                                table_array[row, col] = cell.content
-                            else:
-                                table_array[row, col] = ""
+                    if not column_header_only:
+                        for row in cell.rows:
+                            for col in cell.cols:
+                                if row == cell.rows[0] and col == cell.cols[0]:
+                                    table_array[row, col] = cell.content
+                                else:
+                                    table_array[row, col] = ""
 
         header = table_array[: max_header_prefix_row + 1, :]
 
         flattened_header = []
 
         for npcol in header.transpose():
             flattened_header.append(" | ".join(OrderedDict.fromkeys((c for c in npcol if c not in [None, ""]))))
-
+        if column_header_only:
+            return flattened_header
         df = DataFrame(
             table_array[max_header_prefix_row + 1 :, :],
             index=None,
@@ -320,7 +329,9 @@ def to_csv(self, **kwargs) -> str:
 
         pandas_kwargs = {"index": False, "header": has_header}
         pandas_kwargs.update(kwargs)
-        return self.to_pandas().to_csv(**pandas_kwargs)
+        df = self.to_pandas(column_header_only=False)
+        assert isinstance(df, DataFrame), "Expected `to_pandas` to return a DataFrame"
+        return df.to_csv(**pandas_kwargs)
 
     def to_html(self, pretty=False, wrap_in_html=False, style=DEFAULT_HTML_STYLE):
         """Converts this table to an HTML string.

diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py
@@ -517,7 +517,7 @@ def merge(self, elt1: Element, elt2: Element) -> Element:
         ]
 
         # Create a new Table object with merged cells
-        merged_table = Table(cells=merged_cells)
+        merged_table = Table(cells=merged_cells, column_headers=elt1.table.column_headers)
 
         title1 = elt1.data["properties"].get("title", "") or ""
         title2 = elt2.data["properties"].get("title", "") or ""

diff --git a/lib/sycamore/sycamore/transforms/split_elements.py b/lib/sycamore/sycamore/transforms/split_elements.py
@@ -1,6 +1,6 @@
 from typing import Optional
 import logging
-from sycamore.data import Document, Element
+from sycamore.data import Document, Element, TableElement
 from sycamore.functions.tokenizer import Tokenizer
 from sycamore.plan_nodes import Node, SingleThreadUser, NonGPUUser
 from sycamore.transforms.map import Map
@@ -58,51 +58,76 @@ def split_one(elem: Element, tokenizer: Tokenizer, max: int) -> list[Element]:
         left = half
         right = half + 1
 
-        # FIXME: make this work with asian languages
-        predicates = [  # in precedence order
-            lambda c: c in ".!?",
-            lambda c: c == ";",
-            lambda c: c in "()",
-            lambda c: c == ":",
-            lambda c: c == ",",
-            str.isspace,
-        ]
-        results: list[Optional[int]] = [None] * len(predicates)
-
-        for jj in range(half // 2):  # stay near middle; avoid the ends
-            lchar = txt[left]
-            rchar = txt[right]
-
-            go = True
-            for ii, predicate in enumerate(predicates):
-                if predicate(lchar):
-                    if results[ii] is None:
-                        results[ii] = left
-                    go = ii != 0
+        # FIXME: both elements produced by the split still carry the whole Table object; the table itself is not split
+        newlineFound = False
+        if elem.type == "table":
+            for jj in range(half // 2):
+                if txt[left] == "\n":
+                    idx = left + 1
+                    newlineFound = True
                     break
-                elif predicate(rchar):
-                    if results[ii] is None:
-                        results[ii] = right
-                    go = ii != 0
+                elif txt[right] == "\n":
+                    idx = right + 1
+                    newlineFound = True
                     break
-            if not go:
-                break
+                left -= 1
+                right += 1
 
-            left -= 1
-            right += 1
+        # FIXME: make this work with Asian languages (the punctuation predicates below are ASCII-only)
+        if not newlineFound:
+            left = half
+            right = half + 1
+            predicates = [  # in precedence order
+                lambda c: c in ".!?",
+                lambda c: c == ";",
+                lambda c: c in "()",
+                lambda c: c == ":",
+                lambda c: c == ",",
+                str.isspace,
+            ]
+            results: list[Optional[int]] = [None] * len(predicates)
+
+            for jj in range(half // 2):  # stay near middle; avoid the ends
+                lchar = txt[left]
+                rchar = txt[right]
+
+                go = True
+                for ii, predicate in enumerate(predicates):
+                    if predicate(lchar):
+                        if results[ii] is None:
+                            results[ii] = left
+                        go = ii != 0
+                        break
+                    elif predicate(rchar):
+                        if results[ii] is None:
+                            results[ii] = right
+                        go = ii != 0
+                        break
+                if not go:
+                    break
 
-        idx = half + 1
-        for res in results:
-            if res is not None:
-                idx = res + 1
-                break
+                left -= 1
+                right += 1
+
+            idx = half + 1
+            for res in results:
+                if res is not None:
+                    idx = res + 1
+                    break
 
         one = txt[:idx]
         two = txt[idx:]
 
         ment = elem.copy()
         elem.text_representation = one
         elem.binary_representation = bytes(one, "utf-8")
+        if elem.type == "table":
+            if not isinstance(elem, TableElement) or elem.table is None:
+                raise ValueError("Element must be tableElement/ have table to perform splitting.")
+            if elem.table.column_headers:
+                two = ", ".join(elem.table.column_headers) + "\n" + two
+            if elem.data["properties"].get("title"):
+                two = elem.data["properties"].get("title") + "\n" + two
         if elem.get("_header"):
             ment.text_representation = ment["_header"] + "\n" + two
         else: