+
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 77 additions & 1 deletion .github/workflows/testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,45 @@ jobs:
run: poetry run pytest remote_processors/test/unit/
working-directory: lib/remote-processors

notebook-tests-fast:
runs-on: integ-test-runner2
strategy:
matrix:
python-version: ["3.9"]
services:
opensearch:
image: opensearchproject/opensearch:2.10.0
env:
discovery.type: "single-node"
ports:
- 9200:9200

steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install poetry
run: pipx install poetry
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
cache: "poetry"
- name: Install sycamore
run: poetry install --all-extras
- name: Download nltk packages
run: poetry run python -m nltk.downloader punkt averaged_perceptron_tagger
- name: Update Apt
run: sudo apt-get update
- name: Install poppler and tesseract
run: sudo apt-get install -y poppler-utils tesseract-ocr
- name: Configure AWS Credentials via OIDC provider.
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: us-east-1
role-to-assume: arn:aws:iam::237550789389:role/aryn-github-integ
- name: Run Notebook tests
run: ./notebooks/run-notebook-tests.sh --fast

integ-tests:
runs-on: integ-test-runner2
strategy:
Expand Down Expand Up @@ -179,5 +218,42 @@ jobs:
run: sudo chmod -R 777 /neo4j/import
- name: Run Integ tests
run: poetry run pytest lib/sycamore/sycamore/tests/integration

notebook-tests-slow:
runs-on: integ-test-runner2
strategy:
matrix:
python-version: ["3.9"]
services:
opensearch:
image: opensearchproject/opensearch:2.10.0
env:
discovery.type: "single-node"
ports:
- 9200:9200

steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install poetry
run: pipx install poetry
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
cache: "poetry"
- name: Install sycamore
run: poetry install --all-extras
- name: Download nltk packages
run: poetry run python -m nltk.downloader punkt averaged_perceptron_tagger
- name: Update Apt
run: sudo apt-get update
- name: Install poppler and tesseract
run: sudo apt-get install -y poppler-utils tesseract-ocr
- name: Configure AWS Credentials via OIDC provider.
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: us-east-1
role-to-assume: arn:aws:iam::237550789389:role/aryn-github-integ
- name: Run Notebook tests
run: poetry run pytest --nbmake --nbmake-timeout=600 notebooks/default-prep-script.ipynb notebooks/jupyter_dev_example.ipynb notebooks/metadata-extraction.ipynb notebooks/sycamore_demo.ipynb notebooks/tutorial.ipynb
run: ./notebooks/run-notebook-tests.sh --slow
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,4 @@ notebooks/default-prep-data
.python-version
apps/timetrace/ttviz
notebooks/pc-tutorial
notebooks/tmp
30 changes: 29 additions & 1 deletion lib/sycamore/sycamore/connectors/file/file_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,15 @@ def __init__(
**resource_args,
):
super().__init__(**resource_args)
assert len(paths) > 0
if isinstance(paths, str):
paths = [paths]
assert isinstance(paths, list)
self._paths = paths
self._filesystem = filesystem
if self._filesystem is None:
self._try_infer_fs()

assert parallelism is None, "Use override_num_blocks; remove parameter after 2025-03-01"
self.override_num_blocks = override_num_blocks

Expand All @@ -95,6 +102,28 @@ def _is_s3_scheme(self) -> bool:
else:
return all(path.startswith("s3:") for path in self._paths)

def _try_infer_fs(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems related/overlapping with #925. Are they going to conflict?

from sycamore.utils.pyarrow import infer_fs

common_fs = None
new_paths = []
for p in self._paths:
(fs, root) = infer_fs(p)
new_paths.append(root)
if common_fs is None:
common_fs = fs
if not isinstance(fs, common_fs.__class__):
logger.warning(
f"Different paths infer multiple filesystems. {self._paths[0]}"
+ f" gives {common_fs.__class__.__name__} and {p} gives"
+ f" {fs.__class__.__name__}. Using no fs and hoping."
)
return

assert common_fs is not None
self._filesystem = common_fs
self._paths = new_paths

@abstractmethod
def process_file(self, file_info: FileInfo) -> list[Document]:
pass
Expand Down Expand Up @@ -155,7 +184,6 @@ def __init__(
filesystem=filesystem,
**resource_args,
)
self._paths = paths
self._binary_format = binary_format
self._metadata_provider = metadata_provider
self._filter_paths_by_extension = filter_paths_by_extension
Expand Down
1 change: 1 addition & 0 deletions lib/sycamore/sycamore/materialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ def _maybe_anonymous(self):
fs.get_file_info(str(self._root))
self._fs = fs
self._fshelper = _PyArrowFsHelper(self._fs)
logger.info(f"Successfully read path {self._root} with anonymous S3")
return
except OSError as e:
logging.warning(
Expand Down
28 changes: 28 additions & 0 deletions lib/sycamore/sycamore/utils/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,37 @@ def infer_fs(path: str) -> Tuple[FileSystem, str]:
from pyarrow import fs

(fs, root) = fs.FileSystem.from_uri(str(path))

fs = maybe_use_anonymous_s3_fs(fs, root)

return (fs, root)


def maybe_use_anonymous_s3_fs(fs: FileSystem, root: str) -> FileSystem:
    """Return a filesystem able to read ``root``, preferring the given one.

    Non-S3 filesystems are returned unchanged. For S3, probe ``root`` with
    the supplied filesystem first; on OSError, retry with an anonymous
    S3FileSystem. If neither probe succeeds, fall back to the original
    (non-working) filesystem so the caller's error surfaces later.
    """
    from pyarrow.fs import S3FileSystem

    if not isinstance(fs, S3FileSystem):
        return fs

    # Probe with the credentialed filesystem first.
    try:
        fs.get_file_info(root)
    except OSError as e:
        logger.warning(f"Got error {e} trying to get file info on {root}, trying again in anonymous mode")
    else:
        return fs

    # Retry anonymously, e.g. for public buckets readable without credentials.
    anon_fs = S3FileSystem(anonymous=True)
    try:
        anon_fs.get_file_info(root)
    except OSError as e:
        logger.warning(
            f"Got error {e} trying to get file info on {root} in anonymous mode. Using previous, non-working fs"
        )
        return fs
    logger.info(f"Successfully read path {root} with anonymous S3")
    return anon_fs


def cross_check_infer_fs(filesystem: Optional[FileSystem], path: str) -> Tuple[FileSystem, str]:
if filesystem is None:
return infer_fs(path)
Expand Down
File renamed without changes.
8 changes: 3 additions & 5 deletions notebooks/ndd_example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,6 @@
"metadata": {},
"outputs": [],
"source": [
"parallelism = max(2, multiprocessing.cpu_count() // 2) if save_resources else -1\n",
"\n",
"tokenizer = HuggingFaceTokenizer('thenlper/gte-small')\n",
"embedder = SentenceTransformerEmbedder(model_name='sentence-transformers/all-MiniLM-L6-v2', batch_size=100)\n",
"\n",
Expand All @@ -157,12 +155,12 @@
"if use_json:\n",
" # Fast way: pre-processed DocSet as JSON...\n",
" path = 's3://aryn-public/cccmad-json'\n",
" ds = ctx.read.json_document(path, filesystem=fsys, parallelism=parallelism)\n",
" ds = ctx.read.json_document(path, filesystem=fsys)\n",
"else:\n",
" # Slow way: process PDF documents via Sycamore pipeline...\n",
" path = 's3://aryn-public/cccmad'\n",
" ds = (\n",
" ctx.read.binary(path, binary_format='pdf', filesystem=fsys, parallelism=parallelism)\n",
" ctx.read.binary(path, binary_format='pdf', filesystem=fsys)\n",
" .partition(partitioner=ArynPartitioner())\n",
" .regex_replace(COALESCE_WHITESPACE)\n",
" .mark_bbox_preset(tokenizer=tokenizer)\n",
Expand Down Expand Up @@ -342,7 +340,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.12.3"
}
},
"nbformat": 4,
Expand Down
112 changes: 112 additions & 0 deletions notebooks/run-notebook-tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#!/bin/bash
main() {
    # Entry point: run either the fast or the slow notebook suite.
    # See config() for which notebooks belong to each suite.
    config
    cd notebooks
    case "$1" in
        --fast)
            echo "Testing fast notebooks..."
            local begin=$(date +%s)
            test_notebooks "${FAST_NOTEBOOKS[@]}"
            local finish=$(date +%s)
            # Arithmetic expansion; same result as the external expr call.
            local took=$((finish - begin))
            echo "-------------------------------------------------------"
            echo "--- Testing fast notebooks took $took/$FAST_MAXTIME seconds"
            if [[ $took -ge $FAST_MAXTIME ]]; then
                echo "--- Fast tests took too long."
                echo "--- Move some fast tests to slow,"
                echo "--- or speed up the fast notebooks."
                exit 1
            fi
            echo "-------------------------------------------------------"
            # Only the fast (blocking) run verifies every notebook is classified.
            check_coverage
            ;;
        --slow)
            echo "Testing slow notebooks..."
            test_notebooks "${SLOW_NOTEBOOKS[@]}"
            ;;
        *)
            echo "Usage: $0 {--fast,--slow}"
            exit 1
            ;;
    esac
    exit 0
}

config() {
    # Classify every notebook under notebooks/ into exactly one of three
    # lists. check_coverage() enforces that the classification is total.

    # Fast notebooks block commits; the whole set must finish in FAST_MAXTIME.
    FAST_NOTEBOOKS=(
        default-prep-script.ipynb # 20+40s on Eric's laptop.
        OpenAI-logprob.ipynb # 5s on Eric's laptop.
    )

    # Budget (seconds) for the fast suite; exceeded => fail the run.
    FAST_MAXTIME=120

    # Slow notebooks are still tested, but do not block commits.
    SLOW_NOTEBOOKS=(
        ArynPartitionerExample.ipynb # Gets a crash message, but passes?
        ArynPartitionerPython.ipynb
        jupyter_dev_example.ipynb # O(n^2) from re-show/take
        metadata-extraction.ipynb # processes entire
        sycamore_demo.ipynb # O(n^2) from re-show/take
        tutorial.ipynb # aryn partitioner on entire sort benchmark
        VisualizePartitioner.ipynb
    )

    # Excluded notebooks are never run automatically.
    EXCLUDE_NOTEBOOKS=(
        # No good reason for exclusion, just what was
        # not automatically tested before
        ArynPartitionerWithLangchain.ipynb # needs langchain to be installed
        duckdb-writer.ipynb # timeout
        elasticsearch-writer.ipynb # needs elasticsearch db setup
        financial-docs-10k-example.ipynb # needs to be fixed like
        # tutorial to have assertion, assertion code needs to move to aryn_sdk
        ndd_example.ipynb # broken
        ntsb-demo.ipynb # needs rps
        opensearch_docs_etl.ipynb # timeout
        opensearch-writer.ipynb # depends on langchain
        pinecone-writer.ipynb # needs pinecone key
        query-demo.ipynb # depends on un-checked in visualize.py
        subtask-sample.ipynb # broken -- old style llm prompts
        sycamore-tutorial-intermediate-etl.ipynb # timeout
        unpickle_query.ipynb # looking for uncommitted file
        weaviate-writer.ipynb # path not set to files
    )
}

test_notebooks() {
    # Run each notebook given as an argument through pytest/nbmake.
    # Notebooks that select a local execution mode are additionally re-run
    # with a ray-variant copy where EXEC_LOCAL is rewritten to EXEC_RAY.
    local nb
    for nb in "$@"; do
        echo
        echo
        echo "-------------------------------------------------------------------------"
        echo "Starting test on $nb as written"
        echo "-------------------------------------------------------------------------"

        time poetry run pytest --nbmake --nbmake-timeout=600 "$nb" || exit 1

        if grep -q sycamore.EXEC_LOCAL "$nb"; then
            sed -e 's/sycamore.EXEC_LOCAL/sycamore.EXEC_RAY/' <"$nb" >"ray-variant-$nb"
            echo "-------------------------------------------------------------------------"
            echo "Starting test on $nb with EXEC_RAY"
            echo "-------------------------------------------------------------------------"
            time poetry run pytest --nbmake --nbmake-timeout=600 "ray-variant-$nb" || exit 1
            rm "ray-variant-$nb"
        fi
    done
}

check_coverage() {
    # Verify that every notebook in the directory is classified in exactly
    # one of FAST_NOTEBOOKS, SLOW_NOTEBOOKS, or EXCLUDE_NOTEBOOKS: each
    # correctly classified notebook appears exactly twice in the combined
    # list (once from ls, once from a config array).
    echo "Verifying coverage of all notebooks..."
    ls *.ipynb | grep -v '^ray-variant-' >/tmp/notebooks.list
    for i in "${FAST_NOTEBOOKS[@]}" "${SLOW_NOTEBOOKS[@]}" "${EXCLUDE_NOTEBOOKS[@]}"; do
        echo "$i"
    done >>/tmp/notebooks.list
    # NOTE: uniq -c left-pads the count (e.g. "      2 foo"), so anchored
    # patterns like '^ 2 ' never match; compare the count field with awk
    # and keep only the bare filename of anything not seen exactly twice.
    sort /tmp/notebooks.list | uniq -c | awk '$1 != 2 {print $2}' >/tmp/notebooks.unlisted
    if [[ -s /tmp/notebooks.unlisted ]]; then
        echo "ERROR: some notebooks are unlisted."
        echo " Add them to FAST_NOTEBOOKS for fast tests that block commits,"
        echo " SLOW_NOTEBOOKS for slower tests that do not block commits,"
        echo " or EXCLUDE_NOTEBOOKS for notebooks that should not be automatically tested."
        echo " Missing notebooks:"
        cat /tmp/notebooks.unlisted
        exit 1
    fi
    rm /tmp/notebooks.list /tmp/notebooks.unlisted
    echo "All notebooks are classified as fast, slow or excluded."
}

main "$@"
# Unreachable guard: main always exits explicitly; fail loudly if it ever returns.
exit 1
15 changes: 11 additions & 4 deletions notebooks/tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"context = sycamore.init()\n",
"\n",
"# Creating a DocSet\n",
"docset = context.read.binary(paths, parallelism=1, binary_format=\"pdf\")"
"docset = context.read.binary(paths, binary_format=\"pdf\")"
]
},
{
Expand All @@ -45,7 +45,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"if the above assertion fails, you can either set the environment variable ARYN_API_KEY and restart jupyter\n",
"If the above assertion fails, you need to configure your Aryn key. If you do not have one already, visit https://www.aryn.ai/cloud to get a key.\n",
"\n",
"To configure the key, you can either set the environment variable ARYN_API_KEY and restart jupyter\n",
"or make a yaml file at the specified path in the assertion error that looks like:\n",
"\n",
"```\n",
Expand All @@ -71,7 +73,12 @@
"\n",
"# We are using ArynPartitioner to partition the documents.\n",
"# Sycamore supports pluggable partitioners for different formats.\n",
"docset = docset.partition(partitioner=ArynPartitioner())"
"docset = docset.partition(partitioner=ArynPartitioner())\n",
"\n",
"# Now save the docset after partitioning so we can re-use computations.\n",
"docset = docset.materialize(\"tmp/tutorial/post-partition\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n",
"# Run the docset to complete the partitioning.\n",
"docset.execute()\n"
]
},
{
Expand Down Expand Up @@ -229,7 +236,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.12.3"
}
},
"nbformat": 4,
Expand Down
Loading
Loading
点击 这是indexloc提供的php浏览器服务,不要输入任何密码和下载