NVIDIA-NeMo · suiyoubi · Jul 22, 2025
diff --git a/ray-curator/ray_curator/examples/video/video_split_clip_example.py b/ray-curator/ray_curator/examples/video/video_split_clip_example.py
@@ -8,6 +8,7 @@
 from ray_curator.stages.video.clipping.transnetv2_extraction import TransNetV2ClipExtractionStage
 from ray_curator.stages.video.clipping.video_frame_extraction import VideoFrameExtractionStage
 from ray_curator.stages.video.embedding.cosmos_embed1 import CosmosEmbed1EmbeddingStage, CosmosEmbed1FrameCreationStage
+from ray_curator.stages.video.filtering.clip_aesthetic_filter import ClipAestheticFilterStage
 from ray_curator.stages.video.filtering.motion_filter import MotionFilterStage, MotionVectorDecodeStage
 from ray_curator.stages.video.io.clip_writer import ClipWriterStage
 from ray_curator.stages.video.io.video_reader_download import VideoReaderDownloadStage
@@ -108,6 +109,15 @@ def create_video_splitting_pipeline(args: argparse.Namespace) -> Pipeline:
             verbose=args.verbose,
         ))
 
+    if args.aesthetic_threshold is not None:
+        pipeline.add_stage(ClipAestheticFilterStage(
+            model_dir=args.model_dir,
+            score_threshold=args.aesthetic_threshold,
+            reduction=args.aesthetic_reduction,
+            num_gpus_per_worker=args.aesthetic_gpus_per_worker,
+            verbose=args.verbose,
+        ))
+
     if args.generate_embeddings and args.embedding_algorithm.startswith("cosmos-embed1"):
         variant = args.embedding_algorithm.split("-")[-1]
         pipeline.add_stage(CosmosEmbed1FrameCreationStage(
@@ -405,6 +415,21 @@ def main(args: argparse.Namespace) -> None:
         default=None,
         help="If specified (e.g. 3.5), filter out clips with an aesthetic score below this threshold.",
     )
+    parser.add_argument(
+        "--aesthetic-reduction",
+        choices=[
+            "mean",
+            "min",
+        ],
+        default="min",
+        help="Method to reduce the frame-level aesthetic scores.",
+    )
+    parser.add_argument(
+        "--aesthetic-gpus-per-worker",
+        type=float,
+        default=0.25,
+        help="Number of GPUs per worker allocated to aesthetic filter.",
+    )
     # Embedding arguments
     parser.add_argument(
         "--embedding-algorithm",

diff --git a/ray-curator/ray_curator/models/aesthetics.py b/ray-curator/ray_curator/models/aesthetics.py
@@ -0,0 +1,115 @@
+"""Model Aesthetics."""
+
+from pathlib import Path
+
+import numpy as np
+import numpy.typing as npt
+import torch
+from safetensors.torch import load_file
+from torch import nn
+
+from .base import ModelInterface
+
+_AESTHETICS_MODEL_ID = "ttj/sac-logos-ava1-l14-linearMSE"  # also the weights sub-directory under model_dir
+
+
+class MLP(nn.Module):
+    """Aesthetic-score regression head.
+
+    Maps a 768-dimensional embedding to a single score through a stack of linear layers.
+    """
+
+    def __init__(self) -> None:
+        """Build the layer stack.
+
+        The modules keep their positions inside ``nn.Sequential`` because the
+        parameter names of a state dict are derived from those positions.
+        """
+        super().__init__()
+        stack = [
+            nn.Linear(768, 1024),
+            nn.Dropout(0.2),
+            nn.Linear(1024, 128),
+            nn.Dropout(0.2),
+            nn.Linear(128, 64),
+            nn.Dropout(0.1),
+            nn.Linear(64, 16),
+            nn.Linear(16, 1),
+        ]
+        self.layers = nn.Sequential(*stack)
+
+    def forward(self, embed: torch.Tensor) -> torch.Tensor:
+        """Run the embeddings through the layer stack without tracking gradients.
+
+        Args:
+            embed: Input embeddings tensor.
+
+        Returns:
+            Predicted aesthetic scores.
+
+        """
+        with torch.no_grad():
+            return self.layers(embed)  # type: ignore[no-any-return]
+
+
+class AestheticScorer(ModelInterface):
+    """Public interface for aesthetic scoring of video embeddings.
+
+    Wraps the pre-trained ``MLP`` head: ``setup()`` loads its weights from
+    ``model_dir`` and calling the instance scores a batch of embeddings.
+    """
+
+    def __init__(self, model_dir: str) -> None:
+        """Record the configuration only; the weights are loaded later by ``setup()``."""
+        super().__init__()
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        self.dtype = torch.float32
+        self.model_dir = model_dir
+        # These will be initialized in setup()
+        self.mlp: MLP | None = None
+        self.weights_path: str | None = None
+
+    @property
+    def conda_env_name(self) -> str:
+        """Name of the conda environment required for this model."""
+        return "video_splitting"
+
+    @property
+    def model_id_names(self) -> list[str]:
+        """Model IDs associated with this scorer (the single aesthetics model ID)."""
+        return [_AESTHETICS_MODEL_ID]
+
+    def setup(self) -> None:
+        """Set up the aesthetic scoring model by loading weights."""
+        # The weights are read from <model_dir>/<model id>/model.safetensors.
+        self.weights_path = str(Path(self.model_dir) / self.model_id_names[0] / "model.safetensors")
+
+        self.mlp = MLP()
+        state_dict = load_file(self.weights_path)
+        self.mlp.load_state_dict(state_dict)
+        self.mlp.to(self.device)
+        self.mlp.eval()
+
+    @torch.no_grad()
+    def __call__(self, embeddings: torch.Tensor | npt.NDArray[np.float32]) -> torch.Tensor:
+        """Score the aesthetics of input embeddings.
+
+        Args:
+            embeddings: Input embeddings as either a torch tensor or numpy array.
+
+        Returns:
+            Aesthetic scores for each input embedding.
+
+        Raises:
+            RuntimeError: If ``setup()`` has not been called yet.
+
+        """
+        if self.mlp is None:
+            msg = "AestheticScorer model not initialized"
+            raise RuntimeError(msg)
+        if isinstance(embeddings, np.ndarray):
+            # Copy first: torch.from_numpy would otherwise share the caller's buffer.
+            embeddings = torch.from_numpy(embeddings.copy())
+        # Cast to self.dtype as well, so non-float32 inputs do not fail inside the linear layers.
+        scores = self.mlp(embeddings.to(self.device, dtype=self.dtype))
+        return scores.squeeze(1)  # type: ignore[no-any-return]
diff --git a/ray-curator/ray_curator/models/clip.py b/ray-curator/ray_curator/models/clip.py
@@ -0,0 +1,160 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Model Clips."""
+
+from pathlib import Path
+from typing import Final
+
+import numpy as np
+import numpy.typing as npt
+import torch
+from torchvision import transforms  # type: ignore[import-untyped]
+from transformers import CLIPModel
+
+from .aesthetics import AestheticScorer
+from .base import ModelInterface
+
+_CLIP_MODEL_ID: Final = "openai/clip-vit-large-patch14"  # also the weights sub-directory under model_dir
+
+
+class CLIPImageEmbeddings(ModelInterface):
+    """Interface for generating CLIP image embeddings from input images."""
+
+    def __init__(self, model_dir: str) -> None:
+        """Record the configuration only; the model is loaded later by ``setup()``."""
+        super().__init__()
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.dtype = torch.float32
+        self.model_dir = model_dir
+        # These will be initialized in setup()
+        self.clip = None
+        self.transforms = None
+
+    @property
+    def conda_env_name(self) -> str:
+        """Name of the conda environment required for this model."""
+        return "video_splitting"
+
+    @property
+    def model_id_names(self) -> list[str]:
+        """Model IDs used by this model (the single CLIP model ID)."""
+        return [_CLIP_MODEL_ID]
+
+    def setup(self) -> None:
+        """Set up the CLIPImageEmbeddings model.
+
+        Loads CLIP from ``<model_dir>/<model id>`` onto ``self.device`` in eval mode and builds the transforms.
+        """
+        weight_file = str(Path(self.model_dir) / self.model_id_names[0])
+        self.clip = CLIPModel.from_pretrained(weight_file).to(self.device).eval()
+
+        # torchvision transforms that match CLIP preprocessor_config.json:
+        self.transforms = transforms.Compose(
+            [
+                transforms.Resize(
+                    224,
+                    interpolation=transforms.InterpolationMode.BICUBIC,
+                    antialias=True,
+                ),
+                transforms.CenterCrop(224),
+                transforms.ConvertImageDtype(torch.float32),  # scales [0, 255] to [0, 1]
+                transforms.Normalize(
+                    mean=(0.48145466, 0.4578275, 0.40821073),
+                    std=(0.26862954, 0.26130258, 0.27577711),
+                ),
+            ],
+        )
+
+    @torch.no_grad()
+    def __call__(self, images: torch.Tensor | npt.NDArray[np.uint8]) -> torch.Tensor:
+        """Embed a batch of images with CLIP.
+
+        Args:
+            images: The images to embed. A numpy array is permuted from (N, H, W, C) to (N, C, H, W);
+                a tensor is used as given (presumably already (N, C, H, W) - confirm with callers).
+
+        Returns:
+            The embeddings, divided by their vector norm along the last dimension.
+
+        Raises:
+            RuntimeError: If ``setup()`` has not been called yet.
+
+        """
+        if self.clip is None or self.transforms is None:
+            msg = "CLIPImageEmbeddings model not initialized"
+            raise RuntimeError(msg)
+        if isinstance(images, np.ndarray):
+            # (N, H, W, C) -> (N, C, H, W)
+            images = torch.from_numpy(images).permute(0, 3, 1, 2)
+        # Move tensor inputs too, not only numpy ones, so both paths reach the model's device.
+        inputs = self.transforms(images.to(self.device))
+        embed = self.clip.get_image_features(pixel_values=inputs)
+
+        # Normalize embeddings
+        return embed / torch.linalg.vector_norm(embed, dim=-1, keepdim=True)  # type: ignore[no-any-return]
+
+class CLIPAestheticScorer(ModelInterface):
+    """Score images aesthetically by chaining two models.
+
+    Images are embedded with ``CLIPImageEmbeddings``; ``AestheticScorer`` then scores the embeddings.
+    """
+
+    def __init__(self, model_dir: str) -> None:
+        """Store ``model_dir``; the two sub-models are created in ``setup()``."""
+        super().__init__()
+        self._clip_model: CLIPImageEmbeddings | None = None
+        self._aesthetic_model: AestheticScorer | None = None
+        self.model_dir = model_dir
+
+    @property
+    def conda_env_name(self) -> str:
+        """Name of the conda environment this model runs in."""
+        return "video_splitting"
+
+    @property
+    def model_id_names(self) -> list[str]:
+        """Model IDs reported for this model.
+
+        NOTE(review): only the CLIP ID is listed although ``setup()`` also loads
+        ``AestheticScorer`` weights - confirm whether its ID belongs here too.
+        """
+        return [_CLIP_MODEL_ID]
+
+    def setup(self) -> None:
+        """Create both sub-models, then set them up (CLIP first, scorer second)."""
+        self._clip_model = CLIPImageEmbeddings(model_dir=self.model_dir)
+        self._aesthetic_model = AestheticScorer(model_dir=self.model_dir)
+        for sub_model in (self._clip_model, self._aesthetic_model):
+            sub_model.setup()
+
+    def __call__(self, images: torch.Tensor | npt.NDArray[np.uint8]) -> torch.Tensor:
+        """Embed ``images`` with CLIP and score the resulting embeddings.
+
+        Args:
+            images: The images to score.
+
+        Returns:
+            The scores.
+
+        Raises:
+            RuntimeError: If ``setup()`` has not created both sub-models.
+
+        """
+        clip_model, scorer = self._clip_model, self._aesthetic_model
+        if clip_model is None or scorer is None:
+            msg = "CLIPAestheticScorer model not initialized"
+            raise RuntimeError(msg)
+        return scorer(clip_model(images))