+
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions lib/sycamore/sycamore/docset.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,10 +712,8 @@ def extract_properties(self, property_extractor: PropertyExtractor, **kwargs) ->
.partition(partition=ArynPartitioner())
.extract_properties(property_extractor)
"""
from sycamore.transforms import ExtractProperties

schema = ExtractProperties(self.plan, property_extractor=property_extractor)
return DocSet(self.context, schema)
map = property_extractor.as_llm_map(self.plan, **kwargs)
return DocSet(self.context, map)

def summarize(self, summarizer: Summarizer, **kwargs) -> "DocSet":
"""
Expand Down
58 changes: 13 additions & 45 deletions lib/sycamore/sycamore/llms/prompts/default_prompts.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import logging
from abc import ABC
from typing import Any, Optional, Type
import textwrap

from sycamore.schema import Schema
from sycamore.llms.prompts.prompts import ElementListPrompt, ElementPrompt, StaticPrompt

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -277,48 +277,18 @@ class _ExtractTablePropertiesPrompt(SimplePrompt):
)


class ExtractPropertiesFromSchemaPrompt(SimplePrompt):
def __init__(self, schema: Schema, text: str):
super().__init__()

self.system = "You are given text contents from a document."
self.user = f"""
Extract values for the following fields:
{self._format_schema(schema)}

Document text:
{text}

Don't return extra information.
If you cannot find a value for a requested property, use the provided default or the value 'None'.
Return your answers as a valid json dictionary that will be parsed in python.
"""

@staticmethod
def _format_schema(schema: Schema) -> str:
text = ""
for i, field in enumerate(schema.fields):
text += f"""
{i} {field.name}: type={field.field_type}: default={field.default}
{field.description}\n
Examples values: {field.examples}

"""
return text


class PropertiesZeroShotGuidancePrompt(SimplePrompt):
def __init__(self):
super().__init__()

self.system = "You are a helpful property extractor. You only return JSON."

self.user = """You are given a few text elements of a document. Extract JSON representing one entity of
class {entity} from the document. The class only has properties {properties}. Using
this context, FIND, FORMAT, and RETURN the JSON representing one {entity}.
Only return JSON as part of your answer. If no entity is in the text, return "None".
{text}
"""
PropertiesZeroShotGuidancePrompt = ElementListPrompt(
system="You are a helpful property extractor. You only return JSON.",
user=textwrap.dedent(
"""\
You are given a few text elements of a document. Extract JSON representing one entity of
class {entity} from the document. The class only has properties {properties}. Using
this context, FIND, FORMAT, and RETURN the JSON representing one {entity}.
Only return JSON as part of your answer. If no entity is in the text, return "None".
{text}
"""
),
)


class EntityExtractorMessagesPrompt(SimplePrompt):
Expand Down Expand Up @@ -413,8 +383,6 @@ def __init__(self, field: str, groups: list[str]):
"TEXT_SUMMARIZER_GUIDANCE_PROMPT_CHAT": _TextSummarizerGuidancePrompt,
"SCHEMA_ZERO_SHOT_GUIDANCE_PROMPT": _SchemaZeroShotGuidancePrompt,
"SCHEMA_ZERO_SHOT_GUIDANCE_PROMPT_CHAT": _SchemaZeroShotGuidancePrompt,
"PROPERTIES_ZERO_SHOT_GUIDANCE_PROMPT": PropertiesZeroShotGuidancePrompt,
"PROPERTIES_ZERO_SHOT_GUIDANCE_PROMPT_CHAT": PropertiesZeroShotGuidancePrompt,
}


Expand Down
8 changes: 4 additions & 4 deletions lib/sycamore/sycamore/llms/prompts/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,8 @@ def render_document(self, doc: Document) -> RenderedPrompt:
A two-message RenderedPrompt containing ``self.system.format()`` and ``self.user.format()``
using the format keys as specified above.
"""
format_args = self.kwargs
format_args["doc_text"] = doc.text_representation
format_args = copy.deepcopy(self.kwargs)
format_args["doc_text"] = doc.text_representation or self._render_element_list_to_string(doc)
flat_props = flatten_data(doc.properties, prefix="doc_property", separator="_")
format_args.update(flat_props)
format_args["elements"] = self._render_element_list_to_string(doc)
Expand Down Expand Up @@ -304,7 +304,7 @@ def render_document(self, doc: Document) -> RenderedPrompt:
"""
i = doc.properties.get(self.iteration_var_name, 0)

format_args = self.kwargs
format_args = copy.deepcopy(self.kwargs)
format_args["doc_text"] = doc.text_representation
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this used for anything? The same question applies in other places too.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For element-based prompts, shouldn't you only be using the element text?

Copy link
Collaborator Author

@HenryL27 HenryL27 Feb 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically, ElementListIterPrompt is a document prompt, so this lets you say "actually, I don't want this prompt to care about the element text" and do something like:

newprompt = oldprompt.set(user="""\
    Extract the {entity} from the text. This is your {doc_property_i}th try.
    Text Representation:
    {doc_text}
""")

without having to change the class or implementation of the object. It doesn't necessarily get used, but in my opinion it should be available if the user wants it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah

flat_props = flatten_data(doc.properties, prefix="doc_property", separator="_")
format_args.update(flat_props)
Expand Down Expand Up @@ -394,7 +394,7 @@ def render_element(self, elt: Element, doc: Document) -> RenderedPrompt:
of the PDF it's on and attach it to the last message (user message if there
is one, o/w system message).
"""
format_args = self.kwargs
format_args = copy.deepcopy(self.kwargs)
format_args.update(self.capture_parent_context(doc, elt))
format_args["elt_text"] = elt.text_representation
flat_props = flatten_data(elt.properties, prefix="elt_property", separator="_")
Expand Down
11 changes: 6 additions & 5 deletions lib/sycamore/sycamore/tests/unit/test_docset.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,13 @@
Partition,
ExtractSchema,
ExtractBatchSchema,
ExtractProperties,
Query,
)
from sycamore.transforms import Filter
from sycamore.transforms.base import get_name_from_callable
from sycamore.transforms.base_llm import LLMMap
from sycamore.transforms.extract_entity import OpenAIEntityExtractor
from sycamore.transforms.extract_schema import SchemaExtractor
from sycamore.transforms.extract_schema import SchemaExtractor, LLMPropertyExtractor
from sycamore.transforms.query import QueryExecutor
from sycamore.transforms.similarity import SimilarityScorer
from sycamore.transforms.sort import Sort
Expand Down Expand Up @@ -278,10 +277,12 @@ def test_extract_batch_schema(self, mocker):

def test_extract_properties(self, mocker):
context = mocker.Mock(spec=Context)
func = mocker.Mock(spec=Callable, extract_properties=lambda d: {})
pe = LLMPropertyExtractor(llm=MockLLM())
docset = DocSet(context, None)
docset = docset.extract_properties(func)
assert isinstance(docset.lineage(), ExtractProperties)
docset = docset.extract_properties(property_extractor=pe)
assert isinstance(docset.lineage(), Map)
docset_back_one = DocSet(context, docset.plan.children[0])
assert isinstance(docset_back_one.lineage(), LLMMap)

def test_take_all(self):
num_docs = 30
Expand Down
118 changes: 108 additions & 10 deletions lib/sycamore/sycamore/tests/unit/transforms/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from sycamore.data import Document, Element
from sycamore.llms.llms import LLM, FakeLLM
from sycamore.schema import Schema, SchemaField
from sycamore.transforms.base_llm import LLMMap
from sycamore.transforms.map import Map
from sycamore.transforms.extract_schema import ExtractBatchSchema, SchemaExtractor
from sycamore.transforms.extract_schema import LLMSchemaExtractor, LLMPropertyExtractor
from sycamore.utils.ray_utils import check_serializable
Expand Down Expand Up @@ -98,7 +100,7 @@ def test_extract_batch_schema(self, mocker):

def test_extract_properties(self, mocker):
llm = mocker.Mock(spec=LLM)
generate = mocker.patch.object(llm, "generate_old")
generate = mocker.patch.object(llm, "generate")
generate.return_value = '```json {"accidentNumber": "FTW95FA129", "location": "Fort Worth, TX"}```'

doc = Document()
Expand All @@ -116,15 +118,58 @@ def test_extract_properties(self, mocker):
}

property_extractor = LLMPropertyExtractor(llm)
doc = property_extractor.extract_properties(doc)
pe_map = property_extractor.as_llm_map(None)
assert len(pe_map.children) == 1
pe_llm_map = pe_map.children[0]
assert isinstance(pe_llm_map, LLMMap)
assert isinstance(pe_map, Map)

docs = pe_llm_map.run([doc])
doc = pe_map.run(docs[0])

# doc = property_extractor.extract_properties(doc)

assert doc.properties["entity"]["weather"] == "sunny"
assert doc.properties["AircraftIncident"]["accidentNumber"] == "FTW95FA129"
assert doc.properties["AircraftIncident"]["location"] == "Fort Worth, TX"

def test_extract_properties_default_to_entity(self, mocker):
llm = mocker.Mock(spec=LLM)
generate = mocker.patch.object(llm, "generate")
generate.return_value = '```json {"accidentNumber": "FTW95FA129", "location": "Fort Worth, TX"}```'

doc = Document()
element1 = Element()
element1.text_representation = "".join(random.choices(string.ascii_letters, k=10))
element2 = Element()
element2.text_representation = "".join(random.choices(string.ascii_letters, k=20))
doc.elements = [element1, element2]
doc.properties = {
"_schema": {
"accidentNumber": "string",
},
"entity": {"weather": "sunny"},
}

property_extractor = LLMPropertyExtractor(llm)
pe_map = property_extractor.as_llm_map(None)
assert len(pe_map.children) == 1
pe_llm_map = pe_map.children[0]
assert isinstance(pe_llm_map, LLMMap)
assert isinstance(pe_map, Map)

docs = pe_llm_map.run([doc])
doc = pe_map.run(docs[0])

# doc = property_extractor.extract_properties(doc)

assert doc.properties["entity"]["weather"] == "sunny"
assert doc.properties["entity"]["accidentNumber"] == "FTW95FA129"
assert doc.properties["entity"]["location"] == "Fort Worth, TX"

def test_extract_properties_explicit_json(self, mocker):
llm = mocker.Mock(spec=LLM)
generate = mocker.patch.object(llm, "generate_old")
generate = mocker.patch.object(llm, "generate")
generate.return_value = '{"accidentNumber": "FTW95FA129"}'

doc = Document()
Expand All @@ -141,13 +186,50 @@ def test_extract_properties_explicit_json(self, mocker):
}

property_extractor = LLMPropertyExtractor(llm)
doc = property_extractor.extract_properties(doc)
pe_map = property_extractor.as_llm_map(None)
assert len(pe_map.children) == 1
pe_llm_map = pe_map.children[0]
assert isinstance(pe_llm_map, LLMMap)
assert isinstance(pe_map, Map)

assert doc.properties["entity"]["accidentNumber"] == "FTW95FA129"
docs = pe_llm_map.run([doc])
doc = pe_map.run(docs[0])

assert doc.properties["AircraftIncident"]["accidentNumber"] == "FTW95FA129"

def test_extract_properties_llm_say_none(self, mocker):
llm = mocker.Mock(spec=LLM)
generate = mocker.patch.object(llm, "generate")
generate.return_value = "None"

doc = Document()
element1 = Element()
element1.text_representation = "".join(random.choices(string.ascii_letters, k=10))
element2 = Element()
element2.text_representation = "".join(random.choices(string.ascii_letters, k=20))
doc.elements = [element1, element2]
doc.properties = {
"_schema": {
"accidentNumber": "string",
},
"_schema_class": "AircraftIncident",
}

property_extractor = LLMPropertyExtractor(llm)
pe_map = property_extractor.as_llm_map(None)
assert len(pe_map.children) == 1
pe_llm_map = pe_map.children[0]
assert isinstance(pe_llm_map, LLMMap)
assert isinstance(pe_map, Map)

docs = pe_llm_map.run([doc])
doc = pe_map.run(docs[0])

assert len(doc.properties["AircraftIncident"]) == 0

def test_extract_properties_fixed_json(self, mocker):
llm = mocker.Mock(spec=LLM)
generate = mocker.patch.object(llm, "generate_old")
generate = mocker.patch.object(llm, "generate")
generate.return_value = '{"accidentNumber": "FTW95FA129"}'

doc = Document()
Expand All @@ -160,13 +242,22 @@ def test_extract_properties_fixed_json(self, mocker):
property_extractor = LLMPropertyExtractor(
llm, schema_name="AircraftIncident", schema={"accidentNumber": "string"}
)
doc = property_extractor.extract_properties(doc)
pe_map = property_extractor.as_llm_map(None)
assert len(pe_map.children) == 1
pe_llm_map = pe_map.children[0]
assert isinstance(pe_llm_map, LLMMap)
assert isinstance(pe_map, Map)

assert doc.properties["entity"]["accidentNumber"] == "FTW95FA129"
docs = pe_llm_map.run([doc])
doc = pe_map.run(docs[0])

assert doc.properties["AircraftIncident"]["accidentNumber"] == "FTW95FA129"

# assert doc.properties["entity"]["accidentNumber"] == "FTW95FA129"

def test_extract_properties_with_schema(self, mocker):
llm = mocker.Mock(spec=LLM)
generate = mocker.patch.object(llm, "generate_old")
generate = mocker.patch.object(llm, "generate")
generate.return_value = (
'{"startDate": "2022-01-22 00:01:31", '
'"endDate": "2022-01-24 00:01:59", '
Expand All @@ -193,7 +284,14 @@ def test_extract_properties_with_schema(self, mocker):
]
)
property_extractor = LLMPropertyExtractor(llm, schema=schema)
doc = property_extractor.extract_properties(doc)
pe_map = property_extractor.as_llm_map(None)
assert len(pe_map.children) == 1
pe_llm_map = pe_map.children[0]
assert isinstance(pe_llm_map, LLMMap)
assert isinstance(pe_map, Map)

docs = pe_llm_map.run([doc])
doc = pe_map.run(docs[0])

assert doc.properties["entity"]["accidentNumber"] == "FTW95FA129"
assert doc.properties["entity"]["startDate"] == datetime.datetime(2022, 1, 22, 0, 1, 31)
Expand Down
2 changes: 0 additions & 2 deletions lib/sycamore/sycamore/transforms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
ExtractSchema,
ExtractBatchSchema,
SchemaExtractor,
ExtractProperties,
PropertyExtractor,
)
from sycamore.transforms.random_sample import RandomSample
Expand Down Expand Up @@ -83,7 +82,6 @@
"ExtractBatchSchema",
"SchemaExtractor",
"PropertyExtractor",
"ExtractProperties",
"RandomSample",
"SplitElements",
"Query",
Expand Down
Loading
Loading
点击 这是indexloc提供的php浏览器服务,不要输入任何密码和下载