+
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 22 additions & 11 deletions lib/sycamore/sycamore/data/table.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections import OrderedDict
from dataclasses import dataclass, field
from typing import Any, Optional, TypeVar, Union
from typing import Any, Optional, TypeVar, Union, List
import xml.etree.ElementTree as ET

from bs4 import BeautifulSoup, Tag
Expand Down Expand Up @@ -99,7 +99,9 @@ class Table:
are lossy, since, for instance, CSV does not natively support spanning cells.
"""

def __init__(self, cells: list[TableCell], caption: Optional[str] = None):
def __init__(
self, cells: list[TableCell], caption: Optional[str] = None, column_headers: Optional[list[str]] = None
):
"""Creates a new Table.

Args:
Expand All @@ -112,6 +114,11 @@ def __init__(self, cells: list[TableCell], caption: Optional[str] = None):
self.num_rows = max(max(c.rows) for c in self.cells) + 1
self.num_cols = max(max(c.cols) for c in self.cells) + 1

if column_headers is not None:
self.column_headers = column_headers
else:
self.column_headers = self.to_pandas(column_header_only=True)

def __eq__(self, other):
if type(other) is not type(self):
return False
Expand Down Expand Up @@ -248,7 +255,7 @@ def from_html(cls, html_str: Optional[str] = None, html_tag: Optional[Tag] = Non
# we speculate that duplication may create confusion, so we default to only displaying a cell's
# content for the first row/column for which it is applicable. The exception is for header rows,
# where we duplicate values to each column to ensure that every column has a fully qualified header.
def to_pandas(self) -> DataFrame:
def to_pandas(self, column_header_only: bool = False) -> Union[DataFrame, List[str]]:
"""Returns this table as a Pandas DataFrame.

For example, Suppose a cell spans row 2-3 and columns 4-5.
Expand Down Expand Up @@ -286,20 +293,22 @@ def to_pandas(self) -> DataFrame:
table_array[row, col] = ""

else:
for row in cell.rows:
for col in cell.cols:
if row == cell.rows[0] and col == cell.cols[0]:
table_array[row, col] = cell.content
else:
table_array[row, col] = ""
if not column_header_only:
for row in cell.rows:
for col in cell.cols:
if row == cell.rows[0] and col == cell.cols[0]:
table_array[row, col] = cell.content
else:
table_array[row, col] = ""

header = table_array[: max_header_prefix_row + 1, :]

flattened_header = []

for npcol in header.transpose():
flattened_header.append(" | ".join(OrderedDict.fromkeys((c for c in npcol if c not in [None, ""]))))

if column_header_only:
return flattened_header
df = DataFrame(
table_array[max_header_prefix_row + 1 :, :],
index=None,
Expand All @@ -320,7 +329,9 @@ def to_csv(self, **kwargs) -> str:

pandas_kwargs = {"index": False, "header": has_header}
pandas_kwargs.update(kwargs)
return self.to_pandas().to_csv(**pandas_kwargs)
df = self.to_pandas(column_header_only=False)
assert isinstance(df, DataFrame), "Expected `to_pandas` to return a DataFrame"
return df.to_csv(**pandas_kwargs)

def to_html(self, pretty=False, wrap_in_html=False, style=DEFAULT_HTML_STYLE):
"""Converts this table to an HTML string.
Expand Down
2 changes: 1 addition & 1 deletion lib/sycamore/sycamore/transforms/merge_elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,7 @@ def merge(self, elt1: Element, elt2: Element) -> Element:
]

# Create a new Table object with merged cells
merged_table = Table(cells=merged_cells)
merged_table = Table(cells=merged_cells, column_headers=elt1.table.column_headers)

title1 = elt1.data["properties"].get("title", "") or ""
title2 = elt2.data["properties"].get("title", "") or ""
Expand Down
95 changes: 60 additions & 35 deletions lib/sycamore/sycamore/transforms/split_elements.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Optional
import logging
from sycamore.data import Document, Element
from sycamore.data import Document, Element, TableElement
from sycamore.functions.tokenizer import Tokenizer
from sycamore.plan_nodes import Node, SingleThreadUser, NonGPUUser
from sycamore.transforms.map import Map
Expand Down Expand Up @@ -58,51 +58,76 @@ def split_one(elem: Element, tokenizer: Tokenizer, max: int) -> list[Element]:
left = half
right = half + 1

# FIXME: make this work with Asian languages
predicates = [ # in precedence order
lambda c: c in ".!?",
lambda c: c == ";",
lambda c: c in "()",
lambda c: c == ":",
lambda c: c == ",",
str.isspace,
]
results: list[Optional[int]] = [None] * len(predicates)

for jj in range(half // 2): # stay near middle; avoid the ends
lchar = txt[left]
rchar = txt[right]

go = True
for ii, predicate in enumerate(predicates):
if predicate(lchar):
if results[ii] is None:
results[ii] = left
go = ii != 0
# FIXME: The table object in each split element still holds the whole table structure rather than only its split portion
newlineFound = False
if elem.type == "table":
for jj in range(half // 2):
if txt[left] == "\n":
idx = left + 1
newlineFound = True
break
elif predicate(rchar):
if results[ii] is None:
results[ii] = right
go = ii != 0
elif txt[right] == "\n":
idx = right + 1
newlineFound = True
break
if not go:
break
left -= 1
right += 1

left -= 1
right += 1
# FIXME: make this work with Asian languages
if not newlineFound:
left = half
right = half + 1
predicates = [ # in precedence order
lambda c: c in ".!?",
lambda c: c == ";",
lambda c: c in "()",
lambda c: c == ":",
lambda c: c == ",",
str.isspace,
]
results: list[Optional[int]] = [None] * len(predicates)

for jj in range(half // 2): # stay near middle; avoid the ends
lchar = txt[left]
rchar = txt[right]

go = True
for ii, predicate in enumerate(predicates):
if predicate(lchar):
if results[ii] is None:
results[ii] = left
go = ii != 0
break
elif predicate(rchar):
if results[ii] is None:
results[ii] = right
go = ii != 0
break
if not go:
break

idx = half + 1
for res in results:
if res is not None:
idx = res + 1
break
left -= 1
right += 1

idx = half + 1
for res in results:
if res is not None:
idx = res + 1
break

one = txt[:idx]
two = txt[idx:]

ment = elem.copy()
elem.text_representation = one
elem.binary_representation = bytes(one, "utf-8")
if elem.type == "table":
if not isinstance(elem, TableElement) or elem.table is None:
raise ValueError("Element must be tableElement/ have table to perform splitting.")
if elem.table.column_headers:
two = ", ".join(elem.table.column_headers) + "\n" + two
if elem.data["properties"].get("title"):
two = elem.data["properties"].get("title") + "\n" + two
if elem.get("_header"):
ment.text_representation = ment["_header"] + "\n" + two
else:
Expand Down
Loading
点击 这是indexloc提供的php浏览器服务,不要输入任何密码和下载