4 changes: 2 additions & 2 deletions code/python/core/config.py
@@ -327,9 +327,9 @@ def load_retrieval_config(self, path: str = "config_retrieval.yaml"):
        for name, cfg in data.get("endpoints", {}).items():
            # Use the new method for all configuration values
            self.retrieval_endpoints[name] = RetrievalProviderConfig(
-               api_key=self._get_config_value(cfg.get("api_key_env")),
+               api_key=self._get_config_value(cfg.get("api_key_env")) or cfg.get("api_key"),
                api_key_env=cfg.get("api_key_env"), # Store the env var name
-               api_endpoint=self._get_config_value(cfg.get("api_endpoint_env")),
+               api_endpoint=self._get_config_value(cfg.get("api_endpoint_env")) or cfg.get("api_endpoint"),
                api_endpoint_env=cfg.get("api_endpoint_env"), # Store the env var name
                database_path=self._get_config_value(cfg.get("database_path")),
                index_name=self._get_config_value(cfg.get("index_name")),
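The net effect in config.py is a two-step lookup: the environment variable named by api_key_env / api_endpoint_env wins, and a literal api_key / api_endpoint in the YAML becomes the fallback. A minimal sketch of that order, with _get_config_value stubbed as a plain environment lookup (an assumption; the real helper lives in core/config.py):

import os

def _get_config_value(env_var_name):
    # Stand-in for the real _get_config_value: resolve an env var name, if one is given
    return os.environ.get(env_var_name) if env_var_name else None

cfg = {
    "api_endpoint_env": "POSTGRES_CONNECTION_STRING",
    "api_endpoint": "postgres://localhost:5432/spc",  # hypothetical literal fallback
}

# Env var takes precedence; the literal YAML value is used only when the env var is unset
api_endpoint = _get_config_value(cfg.get("api_endpoint_env")) or cfg.get("api_endpoint")
print(api_endpoint)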
25 changes: 16 additions & 9 deletions code/python/data_loading/db_load.py
@@ -872,21 +872,24 @@ async def loadJsonToDB(file_path: str, site: str, batch_size: int = 100, delete_

            for j, embedding in enumerate(embeddings):
                if j < len(batch_docs):
-                   doc = batch_docs[j].copy() # Create a copy of the document
+                   original_doc = batch_docs[j]

-                   # Add embedding to document
-                   doc["embedding"] = embedding
+                   # Create embedding-only document for upload
+                   embedding_doc = {
+                       "id": original_doc["id"],
+                       "embedding": embedding
+                   }

                    # Format embedding as string - ensure no newlines
                    embedding_str = str(embedding).replace(' ', '').replace('\n', '')

                    # Ensure JSON has no newlines
-                   doc_json = doc['schema_json'].replace('\n', ' ')
+                   doc_json = original_doc['schema_json'].replace('\n', ' ')

                    # Write to embeddings file
-                   embed_file.write(f"{doc['url']}\t{doc_json}\t{embedding_str}\n")
+                   embed_file.write(f"{original_doc['url']}\t{doc_json}\t{embedding_str}\n")

-                   docs_with_embeddings.append(doc)
+                   docs_with_embeddings.append(embedding_doc)

            # Upload batch using the client directly
            batch_idx = i // batch_size
@@ -1022,9 +1025,13 @@ async def loadUrlListToDB(file_path: str, site: str, batch_size: int = 100, dele
            docs_with_embeddings = []
            for k, embedding in enumerate(embeddings):
                if k < len(batch_docs):
-                   doc = batch_docs[k].copy()
-                   doc["embedding"] = embedding
-                   docs_with_embeddings.append(doc)
+                   original_doc = batch_docs[k]
+                   # Create embedding-only document for upload
+                   embedding_doc = {
+                       "id": original_doc["id"],
+                       "embedding": embedding
+                   }
+                   docs_with_embeddings.append(embedding_doc)

            # Upload batch directly with client
            batch_idx = j // batch_size
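Both loaders now ship a minimal payload per document: just the primary key and its vector, since the full metadata is expected to exist in the database already. A small illustration of the resulting shape (ids and vectors are made up):

embeddings = [[0.01, -0.02, 0.03], [0.04, 0.05, -0.06]]
batch_docs = [
    {"id": "101", "url": "https://example.com/a", "schema_json": "{}"},
    {"id": "102", "url": "https://example.com/b", "schema_json": "{}"},
]

# Each uploaded document carries only id and embedding
docs_with_embeddings = [
    {"id": doc["id"], "embedding": emb}
    for doc, emb in zip(batch_docs, embeddings)
]
# [{'id': '101', 'embedding': [0.01, -0.02, 0.03]}, {'id': '102', 'embedding': [0.04, 0.05, -0.06]}]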
6 changes: 4 additions & 2 deletions code/python/data_loading/db_load_utils.py
@@ -190,8 +190,9 @@ def prepare_documents_from_json(url: str, json_data: str, site: str) -> Tuple[Li
            item_json = json.dumps(item)

            # Add document to batch
+           # Expect the item to already have an 'id' field from the original service
            doc = {
-               "id": str(int64_hash(item_url)),
+               "id": item.get("id", str(int64_hash(item_url))), # Use existing id or fallback to generated one
                "schema_json": item_json,
                "url": item_url,
                "name": get_item_name(item),
@@ -245,8 +246,9 @@ def documents_from_csv_line(line, site):
        name = get_item_name(item)

        # Ensure no None values in the document
+       # Expect the item to already have an 'id' field from the original service
        doc = {
-           "id": str(int64_hash(item_url)),
+           "id": item.get("id", str(int64_hash(item_url))), # Use existing id or fallback to generated one
            "embedding": embedding,
            "schema_json": json.dumps(item),
            "url": item_url or "",
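The id fallback keeps backward compatibility: items that arrive with a service-assigned id keep it, while items without one still get the hash-derived id the loader used before. A quick sketch with int64_hash stubbed for illustration (the real helper is defined elsewhere in the codebase):

def int64_hash(s: str) -> int:
    # Stand-in for the real int64_hash; only the call shape matters here
    return hash(s) & 0x7FFFFFFFFFFFFFFF

item_with_id = {"id": "svc-42", "url": "https://example.com/x"}
item_without_id = {"url": "https://example.com/y"}

for item in (item_with_id, item_without_id):
    doc_id = item.get("id", str(int64_hash(item["url"])))
    print(doc_id)  # "svc-42" for the first item, a generated hash string for the second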
45 changes: 24 additions & 21 deletions code/python/retrieval_providers/postgres_client.py
@@ -59,6 +59,14 @@
        self.database_path = self.endpoint_config.database_path
        self.default_collection_name = self.endpoint_config.index_name or "nlweb_collection"

+       logger.debug(f"Endpoint config - api_endpoint: {self.api_endpoint}")
+       logger.debug(f"Endpoint config - api_key: {self.api_key}")
+
+       if not self.api_endpoint:
+           logger.error(f"api_endpoint is None/empty for endpoint '{self.endpoint_name}'")
+           logger.error(f"Full endpoint config: {self.endpoint_config}")
+           raise ValueError(f"No connection string found for PostgreSQL endpoint '{self.endpoint_name}'")
+
        self.pg_raw_config = self._get_config_from_postgres_connection_string(self.api_endpoint)

        self.host = self.pg_raw_config.get("host")
@@ -98,8 +106,9 @@
        database = parsed_url.path[1:] # remove leading slash
        query_params = parse_qs(parsed_url.query)

-       username = query_params.get('user', [None])[0]
-       password = query_params.get('password', [None])[0]
+       # Get username and password from URL components first, then fall back to query params
+       username = parsed_url.username or query_params.get('user', [None])[0]
+       password = parsed_url.password or query_params.get('password', [None])[0]

        return {
            'host': host,
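The parsing order this establishes (URL userinfo first, query parameters as fallback) can be checked in isolation with the standard library; both connection-string styles below are illustrative:

from urllib.parse import urlparse, parse_qs

for url in (
    "postgres://alice:s3cret@localhost:5432/spc",                  # credentials in userinfo
    "postgresql://localhost:5432/spc?user=alice&password=s3cret",  # credentials in query params
):
    parsed = urlparse(url)
    query_params = parse_qs(parsed.query)
    username = parsed.username or query_params.get('user', [None])[0]
    password = parsed.password or query_params.get('password', [None])[0]
    print(username, password)  # prints "alice s3cret" for both forms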
@@ -307,9 +316,9 @@

        for idx, doc in enumerate(batch):
            try:
-               # Ensure required fields are present
-               if not all(k in doc for k in ["id", "url", "name","schema_json", "site", "embedding"]):
-                   missing = [k for k in ["id", "url", "name","schema_json", "site", "embedding"] if k not in doc]
+               # Ensure required fields are present (only id and embedding needed)
+               if not all(k in doc for k in ["id", "embedding"]):
+                   missing = [k for k in ["id", "embedding"] if k not in doc]
                    logger.warning(f"Skipping document with missing fields: {missing}")
                    continue

@@ -329,16 +338,12 @@
                    print(f"Invalid embedding example: {str(embedding[:5])}...")
                    continue

-               # Add placeholder for this row
-               placeholders.append("(%s, %s, %s, %s, %s, %s::vector)")
+               # Add placeholder for this row (only id and embedding)
+               placeholders.append("(%s, %s::vector)")

-               # Add values
+               # Add values (only id and embedding)
                values.extend([
                    doc["id"],
-                   doc["url"],
-                   doc["name"],
-                   doc["schema_json"],
-                   doc["site"],
                    embedding # This should be a list of floats
                ])

@@ -352,18 +357,16 @@
            return 0

        # Build and execute the query
+       # IMPORTANT: Only update embedding field, assuming document metadata already exists
+       # Use UPDATE instead of INSERT since documents should already exist
        query = f"""
-       INSERT INTO {self.table_name} (id, url, name, schema_json, site, embedding)
-       VALUES {', '.join(placeholders)}
-       ON CONFLICT (id) DO UPDATE SET
-           url = EXCLUDED.url,
-           name = EXCLUDED.name,
-           schema_json = EXCLUDED.schema_json,
-           site = EXCLUDED.site,
-           embedding = EXCLUDED.embedding
+       UPDATE {self.table_name}
+       SET embedding = data.embedding
+       FROM (VALUES {', '.join(placeholders)}) AS data(id, embedding)
+       WHERE {self.table_name}.id = data.id::integer
        """

-       print(f"Executing query with {len(values) // 6} rows")
+       print(f"Executing query with {len(values) // 2} rows")
        try:
            await cur.execute(query, values)
            count = cur.rowcount
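For a two-document batch, the rendered statement looks roughly like the sketch below (table name and values are illustrative). Note that the data.id::integer cast assumes the table's id column is integer-typed:

# Sketch of the batched UPDATE the new code builds, for a hypothetical 2-row batch
table_name = "documents"  # illustrative; the client reads this from config
placeholders = ["(%s, %s::vector)", "(%s, %s::vector)"]
query = f"""
UPDATE {table_name}
SET embedding = data.embedding
FROM (VALUES {', '.join(placeholders)}) AS data(id, embedding)
WHERE {table_name}.id = data.id::integer
"""
values = ["101", [0.1, 0.2, 0.3], "102", [0.4, 0.5, 0.6]]
print(f"Executing query with {len(values) // 2} rows")  # 2 rows
print(query)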
@@ -790,15 +793,15 @@
    """Example test function for the PgVector client"""
    client = PgVectorClient()

    # Test connection first
    connection_info = await client.test_connection()
    print("Connection test results:")
    print(f" Success: {connection_info['success']}")
    if connection_info.get("error"):
        print(f" Error: {connection_info['error']}")
        return

    print(f" PostgreSQL version: {connection_info['database_version']}")
    print(f" pgvector installed: {connection_info['pgvector_installed']}")
    print(f" Table exists: {connection_info['table_exists']}")
    print(f" Document count: {connection_info['document_count']}")

Check failure (Code scanning / CodeQL): Clear-text logging of sensitive information (High). This expression logs sensitive data (password) as clear text. The same finding is reported against each of the connection-info print statements in this hunk.
5 changes: 4 additions & 1 deletion code/python/webserver/middleware/auth.py
@@ -19,7 +19,10 @@
    '/static',
    '/html',
    # Allow public access to ask endpoint for now (can be changed)
-   '/ask'
+   '/ask',
+   # TODO: make this endpoint private
+   # Indexing endpoints
+   '/api/index/json'
}


2 changes: 2 additions & 0 deletions code/python/webserver/routes/__init__.py
@@ -4,6 +4,7 @@
from .api import setup_api_routes
from .health import setup_health_routes
from .mcp import setup_mcp_routes
+from .indexing import setup_indexing_routes


def setup_routes(app):
@@ -12,6 +13,7 @@ def setup_routes(app):
    setup_api_routes(app)
    setup_health_routes(app)
    setup_mcp_routes(app)
+   setup_indexing_routes(app)

    # TODO: Add these as we implement them
    # setup_oauth_routes(app)
101 changes: 101 additions & 0 deletions code/python/webserver/routes/indexing.py
@@ -0,0 +1,101 @@
"""JSON indexing API routes"""

import json
import logging
import tempfile
import os
from aiohttp import web

from data_loading.db_load import loadJsonToDB

logger = logging.getLogger(__name__)


def setup_indexing_routes(app: web.Application):
"""Setup indexing API routes"""
app.router.add_post('/api/index/json', index_json_handler)


async def index_json_handler(request: web.Request) -> web.Response:
"""
Handle direct JSON data indexing.

Expects JSON body with:
- data (array or object): JSON data to index (single object or array of objects)
- site (str): Site identifier (e.g., "user_qdrant", "company_qdrant", etc.)
- batch_size (int, optional): Batch size for processing (default: 100)
- delete_existing (bool, optional): Whether to delete existing entries for this site (default: false)
- database (str, optional): Specific database endpoint to use
"""
try:
body = await request.json()

data = body.get('data')
site = body.get('site')

if not data or not site:
return web.json_response({
"success": False,
"error": "Missing required parameters: data and site"
}, status=400)

batch_size = body.get('batch_size', 100)
delete_existing = body.get('delete_existing', False)
database = body.get('database')

# Create temporary JSONL file from the data
temp_path = None
try:
with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False, encoding='utf-8') as temp_file:
if isinstance(data, list):
# Array of objects
for item in data:
temp_file.write(json.dumps(item) + '\n')
else:
# Single object
temp_file.write(json.dumps(data) + '\n')

temp_path = temp_file.name

logger.info(f"Starting JSON indexing: site={site}, objects={len(data) if isinstance(data, list) else 1}")

Check failure (Code scanning / CodeQL): Log Injection (High). This log entry depends on a user-provided value.

Copilot Autofix (AI, about 2 months ago):

To fix this log injection vulnerability, sanitize the user-supplied site value before logging it. The generally recommended approach for plain-text logs is to strip carriage returns and newlines (\r, \n) from the value, so an attacker cannot inject extra log lines. This can be done by calling .replace() on the variable before interpolating it into the log message.

Best practice is to sanitize as close as possible to the point of use, but for readability and to prevent accidental omission it makes sense to sanitize just above where the variable is first logged:

  • Strip carriage returns and newlines from site with site.replace('\r', '').replace('\n', '') and log the sanitized value.
  • Optionally give the sanitized version an explicit name, e.g. site_for_log.
  • No external dependencies are required.

Make this change where site is logged on line 60 (and optionally line 72) by introducing a sanitized version just before.

Suggested changeset 1: code/python/webserver/routes/indexing.py. Run the following command in your local git repository to apply this patch:

cat << 'EOF' | git apply
diff --git a/code/python/webserver/routes/indexing.py b/code/python/webserver/routes/indexing.py
--- a/code/python/webserver/routes/indexing.py
+++ b/code/python/webserver/routes/indexing.py
@@ -57,7 +57,8 @@
                 
                 temp_path = temp_file.name
             
-            logger.info(f"Starting JSON indexing: site={site}, objects={len(data) if isinstance(data, list) else 1}")
+            site_for_log = site.replace('\r', '').replace('\n', '') if isinstance(site, str) else str(site)
+            logger.info(f"Starting JSON indexing: site={site_for_log}, objects={len(data) if isinstance(data, list) else 1}")
             
             # Use loadJsonToDB with the temporary file
             total_documents = await loadJsonToDB(
@@ -69,7 +70,7 @@
                 database=database
             )
             
-            logger.info(f"JSON indexing completed: {total_documents} documents indexed for site {site}")
+            logger.info(f"JSON indexing completed: {total_documents} documents indexed for site {site_for_log}")
             
             # Return success=False if no documents were indexed
             success = total_documents > 0
EOF

            # Use loadJsonToDB with the temporary file
            total_documents = await loadJsonToDB(
                file_path=temp_path,
                site=site,
                batch_size=batch_size,
                delete_existing=delete_existing,
                force_recompute=False,
                database=database
            )

            logger.info(f"JSON indexing completed: {total_documents} documents indexed for site {site}")

Check failure (Code scanning / CodeQL): Log Injection (High). This log entry depends on a user-provided value.

Copilot Autofix (AI, about 2 months ago):

To fix the problem, sanitize the user-provided site value before it is included in the log entry. Specifically, remove any carriage return (\r) and line feed (\n) characters, which are the characters most often used in log injection attacks, by applying .replace('\r', '') and .replace('\n', '') before any logging operation that interpolates or concatenates user input.

The relevant change is needed just before the log statement on line 72. Sanitize site after extracting it from the body and before it is used in any logging statement (lines 60, 72, and any other log sink in the handler). For completeness and future safety, it is best to sanitize site at the top, right after extraction, and use the sanitized version throughout the function.

No additional dependencies are required; only a code edit is needed.

Suggested changeset 1: code/python/webserver/routes/indexing.py. Run the following command in your local git repository to apply this patch:

cat << 'EOF' | git apply
diff --git a/code/python/webserver/routes/indexing.py b/code/python/webserver/routes/indexing.py
--- a/code/python/webserver/routes/indexing.py
+++ b/code/python/webserver/routes/indexing.py
@@ -32,6 +32,8 @@
         
         data = body.get('data')
         site = body.get('site')
+        # Sanitize site input to prevent log injection
+        safe_site = site.replace('\r', '').replace('\n', '') if site else ''
         
         if not data or not site:
             return web.json_response({
@@ -57,7 +59,7 @@
                 
                 temp_path = temp_file.name
             
-            logger.info(f"Starting JSON indexing: site={site}, objects={len(data) if isinstance(data, list) else 1}")
+            logger.info(f"Starting JSON indexing: site={safe_site}, objects={len(data) if isinstance(data, list) else 1}")
             
             # Use loadJsonToDB with the temporary file
             total_documents = await loadJsonToDB(
@@ -69,7 +71,7 @@
                 database=database
             )
             
-            logger.info(f"JSON indexing completed: {total_documents} documents indexed for site {site}")
+            logger.info(f"JSON indexing completed: {total_documents} documents indexed for site {safe_site}")
             
             # Return success=False if no documents were indexed
             success = total_documents > 0
EOF

            # Return success=False if no documents were indexed
            success = total_documents > 0

            return web.json_response({
                "success": success,
                "message": f"Successfully indexed {total_documents} documents" if success else f"Failed to index documents - 0 documents processed",
                "details": {
                    "total_documents": total_documents,
                    "site": site,
                    "input_objects": len(data) if isinstance(data, list) else 1,
                    "database": database or "default"
                }
            })

        finally:
            # Clean up temporary file
            if temp_path and os.path.exists(temp_path):
                try:
                    os.unlink(temp_path)
                except Exception:
                    pass

    except Exception as e:
        logger.error(f"Error in JSON indexing: {e}", exc_info=True)
        return web.json_response({
            "success": False,
            "error": str(e)
        }, status=500)
13 changes: 6 additions & 7 deletions config/config_retrieval.yaml
@@ -1,4 +1,4 @@
-write_endpoint: user_qdrant
+write_endpoint: postgres

endpoints:

@@ -90,11 +90,10 @@ endpoints:
    use_knn: false

  postgres:
-   enabled: false
-   # Database connection details (i.e. "postgresql://<HOST>:<PORT>/<DATABASE>?user=<USERNAME>&sslmode=require")
+   enabled: true
+   # Database connection details
+   # api_endpoint: "postgres://postgres:postgres@localhost:5432/spc"
    api_endpoint_env: POSTGRES_CONNECTION_STRING
-   # Password for authentication
-   api_key_env: POSTGRES_PASSWORD
    # Index name to search in
    index_name: documents
    # Specify the database type
@@ -117,7 +116,7 @@ endpoints:

  # Same Qdrant server, different collection for companies
  company_qdrant:
-   enabled: true
+   enabled: false
    # Connect to the same Qdrant server
    api_endpoint_env: QDRANT_URL
    # Same API key for authentication
@@ -129,7 +128,7 @@

  # Same Qdrant server, different collection for users
  user_qdrant:
-   enabled: true
+   enabled: false
    # Connect to the same Qdrant server
    api_endpoint_env: QDRANT_URL
    # Same API key for authentication
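With the postgres endpoint enabled and api_key_env removed, the credentials travel inside the connection string itself, which the endpoint reads from POSTGRES_CONNECTION_STRING. A minimal sketch, reusing the commented example value from the diff above:

import os

# Illustrative only; in practice set this in the shell or an .env file, not in code
os.environ["POSTGRES_CONNECTION_STRING"] = "postgres://postgres:postgres@localhost:5432/spc"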