4 changes: 2 additions & 2 deletions code/python/core/config.py
@@ -327,9 +327,9 @@ def load_retrieval_config(self, path: str = "config_retrieval.yaml"):
        for name, cfg in data.get("endpoints", {}).items():
            # Use the new method for all configuration values
            self.retrieval_endpoints[name] = RetrievalProviderConfig(
-               api_key=self._get_config_value(cfg.get("api_key_env")),
+               api_key=self._get_config_value(cfg.get("api_key_env")) or cfg.get("api_key"),
                api_key_env=cfg.get("api_key_env"), # Store the env var name
-               api_endpoint=self._get_config_value(cfg.get("api_endpoint_env")),
+               api_endpoint=self._get_config_value(cfg.get("api_endpoint_env")) or cfg.get("api_endpoint"),
                api_endpoint_env=cfg.get("api_endpoint_env"), # Store the env var name
                database_path=self._get_config_value(cfg.get("database_path")),
                index_name=self._get_config_value(cfg.get("index_name")),
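The net effect in config.py is a two-step lookup: the environment variable named by api_key_env / api_endpoint_env wins, and a literal api_key / api_endpoint in the YAML becomes the fallback. A minimal sketch of that order, with _get_config_value stubbed as a plain environment lookup (an assumption; the real helper lives in core/config.py):

import os

def _get_config_value(env_var_name):
    # Stand-in for the real _get_config_value: resolve an env var name, if one is given
    return os.environ.get(env_var_name) if env_var_name else None

cfg = {
    "api_endpoint_env": "POSTGRES_CONNECTION_STRING",
    "api_endpoint": "postgres://localhost:5432/spc",  # hypothetical literal fallback
}

# Env var takes precedence; the literal YAML value is used only when the env var is unset
api_endpoint = _get_config_value(cfg.get("api_endpoint_env")) or cfg.get("api_endpoint")
print(api_endpoint)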
25 changes: 16 additions & 9 deletions code/python/data_loading/db_load.py
@@ -872,21 +872,24 @@ async def loadJsonToDB(file_path: str, site: str, batch_size: int = 100, delete_

            for j, embedding in enumerate(embeddings):
                if j < len(batch_docs):
-                   doc = batch_docs[j].copy() # Create a copy of the document
+                   original_doc = batch_docs[j]

-                   # Add embedding to document
-                   doc["embedding"] = embedding
+                   # Create embedding-only document for upload
+                   embedding_doc = {
+                       "id": original_doc["id"],
+                       "embedding": embedding
+                   }

                    # Format embedding as string - ensure no newlines
                    embedding_str = str(embedding).replace(' ', '').replace('\n', '')

                    # Ensure JSON has no newlines
-                   doc_json = doc['schema_json'].replace('\n', ' ')
+                   doc_json = original_doc['schema_json'].replace('\n', ' ')

                    # Write to embeddings file
-                   embed_file.write(f"{doc['url']}\t{doc_json}\t{embedding_str}\n")
+                   embed_file.write(f"{original_doc['url']}\t{doc_json}\t{embedding_str}\n")

-                   docs_with_embeddings.append(doc)
+                   docs_with_embeddings.append(embedding_doc)

            # Upload batch using the client directly
            batch_idx = i // batch_size
@@ -1022,9 +1025,13 @@ async def loadUrlListToDB(file_path: str, site: str, batch_size: int = 100, dele
            docs_with_embeddings = []
            for k, embedding in enumerate(embeddings):
                if k < len(batch_docs):
-                   doc = batch_docs[k].copy()
-                   doc["embedding"] = embedding
-                   docs_with_embeddings.append(doc)
+                   original_doc = batch_docs[k]
+                   # Create embedding-only document for upload
+                   embedding_doc = {
+                       "id": original_doc["id"],
+                       "embedding": embedding
+                   }
+                   docs_with_embeddings.append(embedding_doc)

            # Upload batch directly with client
            batch_idx = j // batch_size
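Both loaders now ship a minimal payload per document: just the primary key and its vector, since the full metadata is expected to exist in the database already. A small illustration of the resulting shape (ids and vectors are made up):

embeddings = [[0.01, -0.02, 0.03], [0.04, 0.05, -0.06]]
batch_docs = [
    {"id": "101", "url": "https://example.com/a", "schema_json": "{}"},
    {"id": "102", "url": "https://example.com/b", "schema_json": "{}"},
]

# Each uploaded document carries only id and embedding
docs_with_embeddings = [
    {"id": doc["id"], "embedding": emb}
    for doc, emb in zip(batch_docs, embeddings)
]
# [{'id': '101', 'embedding': [0.01, -0.02, 0.03]}, {'id': '102', 'embedding': [0.04, 0.05, -0.06]}]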
6 changes: 4 additions & 2 deletions code/python/data_loading/db_load_utils.py
@@ -190,8 +190,9 @@ def prepare_documents_from_json(url: str, json_data: str, site: str) -> Tuple[Li
            item_json = json.dumps(item)

            # Add document to batch
+           # Expect the item to already have an 'id' field from the original service
            doc = {
-               "id": str(int64_hash(item_url)),
+               "id": item.get("id", str(int64_hash(item_url))), # Use existing id or fallback to generated one
                "schema_json": item_json,
                "url": item_url,
                "name": get_item_name(item),
@@ -245,8 +246,9 @@ def documents_from_csv_line(line, site):
        name = get_item_name(item)

        # Ensure no None values in the document
+       # Expect the item to already have an 'id' field from the original service
        doc = {
-           "id": str(int64_hash(item_url)),
+           "id": item.get("id", str(int64_hash(item_url))), # Use existing id or fallback to generated one
            "embedding": embedding,
            "schema_json": json.dumps(item),
            "url": item_url or "",
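The id fallback keeps backward compatibility: items that arrive with a service-assigned id keep it, while items without one still get the hash-derived id the loader used before. A quick sketch with int64_hash stubbed for illustration (the real helper is defined elsewhere in the codebase):

def int64_hash(s: str) -> int:
    # Stand-in for the real int64_hash; only the call shape matters here
    return hash(s) & 0x7FFFFFFFFFFFFFFF

item_with_id = {"id": "svc-42", "url": "https://example.com/x"}
item_without_id = {"url": "https://example.com/y"}

for item in (item_with_id, item_without_id):
    doc_id = item.get("id", str(int64_hash(item["url"])))
    print(doc_id)  # "svc-42" for the first item, a generated hash string for the second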
45 changes: 24 additions & 21 deletions code/python/retrieval_providers/postgres_client.py
@@ -59,6 +59,14 @@
        self.database_path = self.endpoint_config.database_path
        self.default_collection_name = self.endpoint_config.index_name or "nlweb_collection"

+       logger.debug(f"Endpoint config - api_endpoint: {self.api_endpoint}")
+       logger.debug(f"Endpoint config - api_key: {self.api_key}")
+
+       if not self.api_endpoint:
+           logger.error(f"api_endpoint is None/empty for endpoint '{self.endpoint_name}'")
+           logger.error(f"Full endpoint config: {self.endpoint_config}")
+           raise ValueError(f"No connection string found for PostgreSQL endpoint '{self.endpoint_name}'")
+
        self.pg_raw_config = self._get_config_from_postgres_connection_string(self.api_endpoint)

        self.host = self.pg_raw_config.get("host")
@@ -98,8 +106,9 @@
        database = parsed_url.path[1:] # remove leading slash
        query_params = parse_qs(parsed_url.query)

-       username = query_params.get('user', [None])[0]
-       password = query_params.get('password', [None])[0]
+       # Get username and password from URL components first, then fall back to query params
+       username = parsed_url.username or query_params.get('user', [None])[0]
+       password = parsed_url.password or query_params.get('password', [None])[0]

        return {
            'host': host,
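The parsing order this establishes (URL userinfo first, query parameters as fallback) can be checked in isolation with the standard library; both connection-string styles below are illustrative:

from urllib.parse import urlparse, parse_qs

for url in (
    "postgres://alice:s3cret@localhost:5432/spc",                  # credentials in userinfo
    "postgresql://localhost:5432/spc?user=alice&password=s3cret",  # credentials in query params
):
    parsed = urlparse(url)
    query_params = parse_qs(parsed.query)
    username = parsed.username or query_params.get('user', [None])[0]
    password = parsed.password or query_params.get('password', [None])[0]
    print(username, password)  # prints "alice s3cret" for both forms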
@@ -307,9 +316,9 @@

        for idx, doc in enumerate(batch):
            try:
-               # Ensure required fields are present
-               if not all(k in doc for k in ["id", "url", "name","schema_json", "site", "embedding"]):
-                   missing = [k for k in ["id", "url", "name","schema_json", "site", "embedding"] if k not in doc]
+               # Ensure required fields are present (only id and embedding needed)
+               if not all(k in doc for k in ["id", "embedding"]):
+                   missing = [k for k in ["id", "embedding"] if k not in doc]
                    logger.warning(f"Skipping document with missing fields: {missing}")
                    continue

@@ -329,16 +338,12 @@
                    print(f"Invalid embedding example: {str(embedding[:5])}...")
                    continue

-               # Add placeholder for this row
-               placeholders.append("(%s, %s, %s, %s, %s, %s::vector)")
+               # Add placeholder for this row (only id and embedding)
+               placeholders.append("(%s, %s::vector)")

-               # Add values
+               # Add values (only id and embedding)
                values.extend([
                    doc["id"],
-                   doc["url"],
-                   doc["name"],
-                   doc["schema_json"],
-                   doc["site"],
                    embedding # This should be a list of floats
                ])

@@ -352,18 +357,16 @@
            return 0

        # Build and execute the query
+       # IMPORTANT: Only update embedding field, assuming document metadata already exists
+       # Use UPDATE instead of INSERT since documents should already exist
        query = f"""
-       INSERT INTO {self.table_name} (id, url, name, schema_json, site, embedding)
-       VALUES {', '.join(placeholders)}
-       ON CONFLICT (id) DO UPDATE SET
-           url = EXCLUDED.url,
-           name = EXCLUDED.name,
-           schema_json = EXCLUDED.schema_json,
-           site = EXCLUDED.site,
-           embedding = EXCLUDED.embedding
+       UPDATE {self.table_name}
+       SET embedding = data.embedding
+       FROM (VALUES {', '.join(placeholders)}) AS data(id, embedding)
+       WHERE {self.table_name}.id = data.id::integer
        """

-       print(f"Executing query with {len(values) // 6} rows")
+       print(f"Executing query with {len(values) // 2} rows")
        try:
            await cur.execute(query, values)
            count = cur.rowcount
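For a two-document batch, the rendered statement looks roughly like the sketch below (table name and values are illustrative). Note that the data.id::integer cast assumes the table's id column is integer-typed:

# Sketch of the batched UPDATE the new code builds, for a hypothetical 2-row batch
table_name = "documents"  # illustrative; the client reads this from config
placeholders = ["(%s, %s::vector)", "(%s, %s::vector)"]
query = f"""
UPDATE {table_name}
SET embedding = data.embedding
FROM (VALUES {', '.join(placeholders)}) AS data(id, embedding)
WHERE {table_name}.id = data.id::integer
"""
values = ["101", [0.1, 0.2, 0.3], "102", [0.4, 0.5, 0.6]]
print(f"Executing query with {len(values) // 2} rows")  # 2 rows
print(query)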
@@ -790,15 +793,15 @@
    """Example test function for the PgVector client"""
    client = PgVectorClient()

    # Test connection first
    connection_info = await client.test_connection()
    print("Connection test results:")
    print(f" Success: {connection_info['success']}")
    if connection_info.get("error"):
        print(f" Error: {connection_info['error']}")
        return

    print(f" PostgreSQL version: {connection_info['database_version']}")
    print(f" pgvector installed: {connection_info['pgvector_installed']}")
    print(f" Table exists: {connection_info['table_exists']}")
    print(f" Document count: {connection_info['document_count']}")

Check failure (Code scanning / CodeQL): Clear-text logging of sensitive information (High). This expression logs sensitive data (password) as clear text. The same finding is reported against each of the connection-info print statements in this hunk.
5 changes: 4 additions & 1 deletion code/python/webserver/middleware/auth.py
@@ -19,7 +19,10 @@
    '/static',
    '/html',
    # Allow public access to ask endpoint for now (can be changed)
-   '/ask'
+   '/ask',
+   # TODO: make this endpoint private
+   # Indexing endpoints
+   '/api/index/json'
}


2 changes: 2 additions & 0 deletions code/python/webserver/routes/__init__.py
@@ -4,6 +4,7 @@
from .api import setup_api_routes
from .health import setup_health_routes
from .mcp import setup_mcp_routes
+from .indexing import setup_indexing_routes


def setup_routes(app):
@@ -12,6 +13,7 @@ def setup_routes(app):
    setup_api_routes(app)
    setup_health_routes(app)
    setup_mcp_routes(app)
+   setup_indexing_routes(app)

    # TODO: Add these as we implement them
    # setup_oauth_routes(app)
101 changes: 101 additions & 0 deletions code/python/webserver/routes/indexing.py
@@ -0,0 +1,101 @@
"""JSON indexing API routes"""

import json
import logging
import tempfile
import os
from aiohttp import web

from data_loading.db_load import loadJsonToDB

logger = logging.getLogger(__name__)


def setup_indexing_routes(app: web.Application):
"""Setup indexing API routes"""
app.router.add_post('/api/index/json', index_json_handler)


async def index_json_handler(request: web.Request) -> web.Response:
"""
Handle direct JSON data indexing.

Expects JSON body with:
- data (array or object): JSON data to index (single object or array of objects)
- site (str): Site identifier (e.g., "user_qdrant", "company_qdrant", etc.)
- batch_size (int, optional): Batch size for processing (default: 100)
- delete_existing (bool, optional): Whether to delete existing entries for this site (default: false)
- database (str, optional): Specific database endpoint to use
"""
try:
body = await request.json()

data = body.get('data')
site = body.get('site')

if not data or not site:
return web.json_response({
"success": False,
"error": "Missing required parameters: data and site"
}, status=400)

batch_size = body.get('batch_size', 100)
delete_existing = body.get('delete_existing', False)
database = body.get('database')

# Create temporary JSONL file from the data
temp_path = None
try:
with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False, encoding='utf-8') as temp_file:
if isinstance(data, list):
# Array of objects
for item in data:
temp_file.write(json.dumps(item) + '\n')
else:
# Single object
temp_file.write(json.dumps(data) + '\n')

temp_path = temp_file.name

logger.info(f"Starting JSON indexing: site={site}, objects={len(data) if isinstance(data, list) else 1}")

Check failure (Code scanning / CodeQL): Log Injection (High). This log entry depends on a user-provided value.

Copilot Autofix (AI, about 2 months ago):

To fix this log injection vulnerability, sanitize the user-supplied site value before logging it. The generally recommended approach for plain-text logs is to strip carriage returns and newlines (\r, \n) from the value, so an attacker cannot inject extra log lines. This can be done by calling .replace() on the variable before interpolating it into the log message.

Best practice is to sanitize as close as possible to the point of use, but for readability and to prevent accidental omission it makes sense to sanitize just above where the variable is first logged:

  • Strip carriage returns and newlines from site with site.replace('\r', '').replace('\n', '') and log the sanitized value.
  • Optionally give the sanitized version an explicit name, e.g. site_for_log.
  • No external dependencies are required.

Make this change where site is logged on line 60 (and optionally line 72) by introducing a sanitized version just before.

Suggested changeset 1: code/python/webserver/routes/indexing.py. Run the following command in your local git repository to apply this patch:

cat << 'EOF' | git apply
diff --git a/code/python/webserver/routes/indexing.py b/code/python/webserver/routes/indexing.py
--- a/code/python/webserver/routes/indexing.py
+++ b/code/python/webserver/routes/indexing.py
@@ -57,7 +57,8 @@
                 
                 temp_path = temp_file.name
             
-            logger.info(f"Starting JSON indexing: site={site}, objects={len(data) if isinstance(data, list) else 1}")
+            site_for_log = site.replace('\r', '').replace('\n', '') if isinstance(site, str) else str(site)
+            logger.info(f"Starting JSON indexing: site={site_for_log}, objects={len(data) if isinstance(data, list) else 1}")
             
             # Use loadJsonToDB with the temporary file
             total_documents = await loadJsonToDB(
@@ -69,7 +70,7 @@
                 database=database
             )
             
-            logger.info(f"JSON indexing completed: {total_documents} documents indexed for site {site}")
+            logger.info(f"JSON indexing completed: {total_documents} documents indexed for site {site_for_log}")
             
             # Return success=False if no documents were indexed
             success = total_documents > 0
EOF

            # Use loadJsonToDB with the temporary file
            total_documents = await loadJsonToDB(
                file_path=temp_path,
                site=site,
                batch_size=batch_size,
                delete_existing=delete_existing,
                force_recompute=False,
                database=database
            )

            logger.info(f"JSON indexing completed: {total_documents} documents indexed for site {site}")

Check failure (Code scanning / CodeQL): Log Injection (High). This log entry depends on a user-provided value.

Copilot Autofix (AI, about 2 months ago):

To fix the problem, sanitize the user-provided site value before it is included in the log entry. Specifically, remove any carriage return (\r) and line feed (\n) characters, which are the characters most often used in log injection attacks, by applying .replace('\r', '') and .replace('\n', '') before any logging operation that interpolates or concatenates user input.

The relevant change is needed just before the log statement on line 72. Sanitize site after extracting it from the body and before it is used in any logging statement (lines 60, 72, and any other log sink in the handler). For completeness and future safety, it is best to sanitize site at the top, right after extraction, and use the sanitized version throughout the function.

No additional dependencies are required; only a code edit is needed.

Suggested changeset 1: code/python/webserver/routes/indexing.py. Run the following command in your local git repository to apply this patch:

cat << 'EOF' | git apply
diff --git a/code/python/webserver/routes/indexing.py b/code/python/webserver/routes/indexing.py
--- a/code/python/webserver/routes/indexing.py
+++ b/code/python/webserver/routes/indexing.py
@@ -32,6 +32,8 @@
         
         data = body.get('data')
         site = body.get('site')
+        # Sanitize site input to prevent log injection
+        safe_site = site.replace('\r', '').replace('\n', '') if site else ''
         
         if not data or not site:
             return web.json_response({
@@ -57,7 +59,7 @@
                 
                 temp_path = temp_file.name
             
-            logger.info(f"Starting JSON indexing: site={site}, objects={len(data) if isinstance(data, list) else 1}")
+            logger.info(f"Starting JSON indexing: site={safe_site}, objects={len(data) if isinstance(data, list) else 1}")
             
             # Use loadJsonToDB with the temporary file
             total_documents = await loadJsonToDB(
@@ -69,7 +71,7 @@
                 database=database
             )
             
-            logger.info(f"JSON indexing completed: {total_documents} documents indexed for site {site}")
+            logger.info(f"JSON indexing completed: {total_documents} documents indexed for site {safe_site}")
             
             # Return success=False if no documents were indexed
             success = total_documents > 0
EOF

            # Return success=False if no documents were indexed
            success = total_documents > 0

            return web.json_response({
                "success": success,
                "message": f"Successfully indexed {total_documents} documents" if success else f"Failed to index documents - 0 documents processed",
                "details": {
                    "total_documents": total_documents,
                    "site": site,
                    "input_objects": len(data) if isinstance(data, list) else 1,
                    "database": database or "default"
                }
            })

        finally:
            # Clean up temporary file
            if temp_path and os.path.exists(temp_path):
                try:
                    os.unlink(temp_path)
                except Exception:
                    pass

    except Exception as e:
        logger.error(f"Error in JSON indexing: {e}", exc_info=True)
        return web.json_response({
            "success": False,
            "error": str(e)
        }, status=500)
13 changes: 6 additions & 7 deletions config/config_retrieval.yaml
@@ -1,4 +1,4 @@
-write_endpoint: user_qdrant
+write_endpoint: postgres

endpoints:

@@ -90,11 +90,10 @@ endpoints:
    use_knn: false

  postgres:
-   enabled: false
-   # Database connection details (i.e. "postgresql://<HOST>:<PORT>/<DATABASE>?user=<USERNAME>&sslmode=require")
+   enabled: true
+   # Database connection details
+   # api_endpoint: "postgres://postgres:postgres@localhost:5432/spc"
    api_endpoint_env: POSTGRES_CONNECTION_STRING
-   # Password for authentication
-   api_key_env: POSTGRES_PASSWORD
    # Index name to search in
    index_name: documents
    # Specify the database type
@@ -117,7 +116,7 @@ endpoints:

  # Same Qdrant server, different collection for companies
  company_qdrant:
-   enabled: true
+   enabled: false
    # Connect to the same Qdrant server
    api_endpoint_env: QDRANT_URL
    # Same API key for authentication
@@ -129,7 +128,7 @@

  # Same Qdrant server, different collection for users
  user_qdrant:
-   enabled: true
+   enabled: false
    # Connect to the same Qdrant server
    api_endpoint_env: QDRANT_URL
    # Same API key for authentication
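With the postgres endpoint enabled and api_key_env removed, the credentials travel inside the connection string itself, which the endpoint reads from POSTGRES_CONNECTION_STRING. A minimal sketch, reusing the commented example value from the diff above:

import os

# Illustrative only; in practice set this in the shell or an .env file, not in code
os.environ["POSTGRES_CONNECTION_STRING"] = "postgres://postgres:postgres@localhost:5432/spc"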