+
Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions code/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ class LLMProviderConfig:
endpoint: Optional[str] = None
api_version: Optional[str] = None

@dataclass
class EmbeddingModeConfig:
    """Configuration for how embeddings are generated, keyed by document type.

    Populated from the ``embedding_mode`` section of config_embedding.yaml
    (see load_embedding_config).
    """
    # Fallback mode when a document type has no explicit entry.
    # Known values in this codebase: "single" (one embedding per document,
    # legacy behavior) and "multi" (multiple specialized embeddings).
    default: str = "single"
    # Maps a document type name (e.g. "company") to its embedding mode.
    document_types: Dict[str, str] = field(default_factory=dict)

@dataclass
class EmbeddingProviderConfig:
api_key: Optional[str] = None
Expand Down Expand Up @@ -201,6 +206,13 @@ def load_embedding_config(self, path: str = "config_embedding.yaml"):

self.preferred_embedding_provider: str = data["preferred_provider"]
self.embedding_providers: Dict[str, EmbeddingProviderConfig] = {}

# Load embedding mode configuration
embedding_mode_data = data.get("embedding_mode", {})
self.embedding_mode = EmbeddingModeConfig(
default=embedding_mode_data.get("default", "single"),
document_types=embedding_mode_data.get("document_types", {})
)

for name, cfg in data.get("providers", {}).items():
# Extract configuration values from the YAML
Expand Down Expand Up @@ -517,6 +529,18 @@ def get_llm_provider(self, provider_name: Optional[str] = None) -> Optional[LLMP
return self.llm_endpoints[self.preferred_llm_endpoint]

return None

def get_embedding_mode(self, document_type: str) -> str:
    """Return the embedding mode configured for *document_type*.

    Resolution order: the per-type entry in the embedding-mode config,
    then the configured default, then the literal "single" when no
    embedding-mode configuration has been loaded at all.
    """
    try:
        mode_config = self.embedding_mode
    except AttributeError:
        # load_embedding_config() was never called; keep legacy behavior.
        return "single"

    # Per-type override wins; otherwise fall back to the configured default.
    return mode_config.document_types.get(document_type, mode_config.default)

# Global singleton
CONFIG = AppConfig()
11 changes: 11 additions & 0 deletions code/config/config_embedding.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
preferred_provider: openai

# Embedding generation mode configuration
embedding_mode:
# Default mode for documents (single or multi)
default: single

# Per-document-type configuration
# Set to 'multi' to generate multiple specialized embeddings per document
# Set to 'single' to generate one embedding per document (legacy behavior)
document_types:
company: multi

providers:
openai:
api_key_env: OPENAI_API_KEY
Expand Down
13 changes: 9 additions & 4 deletions code/scraping/incrementalCrawlAndLoad.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,9 +323,9 @@ async def _process_single_url(self, url: str) -> bool:
else:
self.stats["schema_types"][schema_type] = 1

# Step 3: Prepare documents for database
# Step 3: Prepare documents for database with specialized embedding texts
documents_to_upload = []
docs, _ = prepare_documents_from_json(final_url, schemas_str, self.db_name)
docs, embedding_texts = prepare_documents_from_json(final_url, schemas_str, self.db_name)
documents_to_upload.extend(docs)

# Step 4: Generate embeddings and upload
Expand All @@ -335,8 +335,13 @@ async def _process_single_url(self, url: str) -> bool:
provider_config = CONFIG.get_embedding_provider(provider)
model = provider_config.model if provider_config else None

# Extract texts for embedding
texts = [doc["schema_json"] for doc in documents_to_upload]
# Use specialized embedding texts if available, otherwise fall back to schema_json
if embedding_texts and len(embedding_texts) == len(documents_to_upload):
texts = embedding_texts
logger.debug(f"Using {len(texts)} specialized embedding texts from multi-embedding generator")
else:
texts = [doc["schema_json"] for doc in documents_to_upload]
logger.debug(f"Falling back to schema_json for {len(texts)} documents (multi-embedding texts not available)")

# Generate embeddings
embeddings = await batch_get_embeddings(texts, provider, model)
Expand Down
24 changes: 18 additions & 6 deletions code/tools/db_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -821,6 +821,9 @@ async def loadJsonToDB(file_path: str, site: str, batch_size: int = 100, delete_
if json_only_format:
print("Detected JSON-only format. URLs will be extracted from within the JSON data.")

# Track both documents and their specialized embedding texts
all_embedding_texts = []

# Process each line to extract documents
for line in lines:
try:
Expand All @@ -830,9 +833,10 @@ async def loadJsonToDB(file_path: str, site: str, batch_size: int = 100, delete_
if url is None or json_data is None:
continue

# Prepare documents
documents, _ = prepare_documents_from_json(url, json_data, site)
# Prepare documents with specialized embedding texts
documents, embedding_texts = prepare_documents_from_json(url, json_data, site)
all_documents.extend(documents)
all_embedding_texts.extend(embedding_texts)
except Exception as e:
print(f"Error processing line: {str(e)}")
continue
Expand All @@ -844,8 +848,13 @@ async def loadJsonToDB(file_path: str, site: str, batch_size: int = 100, delete_

# Open file to write documents with embeddings
with open(embeddings_path, 'w', encoding='utf-8') as embed_file:
# Extract texts for embedding
texts = [doc["schema_json"] for doc in all_documents]
# Use specialized embedding texts if available, otherwise fall back to schema_json
if all_embedding_texts and len(all_embedding_texts) == len(all_documents):
texts = all_embedding_texts
print(f"Using {len(texts)} specialized embedding texts from multi-embedding generator")
else:
texts = [doc["schema_json"] for doc in all_documents]
print(f"Falling back to schema_json for {len(texts)} documents (multi-embedding texts not available)")

# Process in batches
total_documents = 0
Expand Down Expand Up @@ -877,8 +886,11 @@ async def loadJsonToDB(file_path: str, site: str, batch_size: int = 100, delete_
# Ensure JSON has no newlines
doc_json = doc['schema_json'].replace('\n', ' ')

# Write to embeddings file
embed_file.write(f"{doc['url']}\t{doc_json}\t{embedding_str}\n")
# Write to embeddings file in new multi-embedding format
# Format: URL \t JSON \t embedding \t base_doc_id \t embedding_type
base_doc_id = doc.get('base_doc_id', '')
embedding_type = doc.get('embedding_type', '')
embed_file.write(f"{doc['url']}\t{doc_json}\t{embedding_str}\t{base_doc_id}\t{embedding_type}\n")

docs_with_embeddings.append(doc)

Expand Down
112 changes: 89 additions & 23 deletions code/tools/db_load_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,11 @@ def get_item_name(item: Dict[str, Any]) -> str:

def prepare_documents_from_json(url: str, json_data: str, site: str) -> Tuple[List[Dict[str, Any]], List[str]]:
"""
Prepare documents from URL and JSON data.
Prepare documents from URL and JSON data using configurable embedding approach.

Uses configuration to determine embedding mode per document type:
- Single mode: Creates one embedding per document (legacy behavior)
- Multi mode: Creates multiple specialized embeddings per document for better search

Args:
url: URL for the item
Expand All @@ -167,6 +171,10 @@ def prepare_documents_from_json(url: str, json_data: str, site: str) -> Tuple[Li
Tuple of (documents, texts_for_embedding)
"""
try:
# Import dependencies
from tools.multi_embedding_generator import generate_document_embeddings, detect_item_type
from config.config import CONFIG

# Parse and trim the JSON
json_obj = json.loads(json_data)
trimmed_json = trim_schema_json(json_obj, site)
Expand All @@ -189,17 +197,40 @@ def prepare_documents_from_json(url: str, json_data: str, site: str) -> Tuple[Li
item_url = url if i == 0 else f"{url}#{i}"
item_json = json.dumps(item)

# Add document to batch
doc = {
"id": str(int64_hash(item_url)),
"schema_json": item_json,
"url": item_url,
"name": get_item_name(item),
"site": site
}
# Determine embedding mode based on configuration
item_type = detect_item_type(item)
embedding_mode = CONFIG.get_embedding_mode(item_type)

documents.append(doc)
texts.append(item_json)
if embedding_mode == "multi":
# Generate multiple specialized embeddings for this item
multi_embeddings = generate_document_embeddings(item, item_url, site)

# Create a document for each specialized embedding
for emb_data in multi_embeddings:
doc = {
"id": emb_data["id"],
"base_doc_id": emb_data["base_doc_id"],
"embedding_type": emb_data["embedding_type"],
"schema_json": item_json,
"url": item_url,
"name": get_item_name(item),
"site": site
}

documents.append(doc)
texts.append(emb_data["embedding_text"])
else:
# Single embedding mode (legacy behavior)
doc = {
"id": str(int64_hash(item_url)),
"schema_json": item_json,
"url": item_url,
"name": get_item_name(item),
"site": site
}

documents.append(doc)
texts.append(item_json)

return documents, texts
except Exception as e:
Expand All @@ -210,17 +241,38 @@ def documents_from_csv_line(line, site):
"""
Parse a line with URL, JSON, and embedding into document objects.

Supports both legacy format (single embedding) and new multi-embedding format:
- Legacy: URL \t JSON \t embedding
- Multi: URL \t JSON \t embedding \t base_doc_id \t embedding_type

Args:
line: Tab-separated line with URL, JSON, and embedding
line: Tab-separated line with URL, JSON, and embedding data
site: Site identifier

Returns:
List of document objects
"""
try:
url, json_data, embedding_str = line.strip().split('\t')
parts = line.strip().split('\t')

if len(parts) < 3:
print(f"Error: Line has insufficient columns ({len(parts)} < 3)")
return []

url = parts[0]
json_data = parts[1]
embedding_str = parts[2]

# Check if this is the new multi-embedding format
is_multi_format = len(parts) >= 5
base_doc_id = parts[3] if is_multi_format else None
embedding_type = parts[4] if is_multi_format else None

# Parse embedding
embedding_str = embedding_str.replace("[", "").replace("]", "")
embedding = [float(x) for x in embedding_str.split(',')]

# Parse and trim JSON
js = json.loads(json_data)
js = trim_schema_json(js, site)
except Exception as e:
Expand All @@ -240,19 +292,33 @@ def documents_from_csv_line(line, site):
if item is None:
continue

# No longer filtering by should_include_item - trimming already handles this
# URL handling
item_url = url if i == 0 else f"{url}#{i}"
name = get_item_name(item)

# Ensure no None values in the document
doc = {
"id": str(int64_hash(item_url)),
"embedding": embedding,
"schema_json": json.dumps(item),
"url": item_url or "",
"name": name or "Unnamed Item",
"site": site or "unknown"
}
# Create document based on format
if is_multi_format:
# New multi-embedding format
doc = {
"id": f"{base_doc_id}_{embedding_type}", # Use the stored multi-embedding ID
"base_doc_id": base_doc_id, # Link back to original document
"embedding_type": embedding_type, # Type of this embedding
"embedding": embedding,
"schema_json": json.dumps(item),
"url": item_url or "",
"name": name or "Unnamed Item",
"site": site or "unknown"
}
else:
# Legacy single embedding format
doc = {
"id": str(int64_hash(item_url)),
"embedding": embedding,
"schema_json": json.dumps(item),
"url": item_url or "",
"name": name or "Unnamed Item",
"site": site or "unknown"
}

# Additional validation to ensure no None values
for key, value in doc.items():
Expand Down
Loading
NOTE (review): the line below was proxy-injected junk, translated for the record — "Click — this is a PHP browser service provided by indexloc; do not enter any passwords or download anything." It is not part of the pull request and should be stripped from this capture.