+
Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions code/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ class LLMProviderConfig:
endpoint: Optional[str] = None
api_version: Optional[str] = None

@dataclass
class EmbeddingModeConfig:
    """Configuration for how embeddings are generated, keyed by document type.

    Populated from the ``embedding_mode`` section of config_embedding.yaml
    (see load_embedding_config).
    """
    # Fallback mode when a document type has no explicit entry.
    # Known values in this codebase: "single" (one embedding per document,
    # legacy behavior) and "multi" (multiple specialized embeddings).
    default: str = "single"
    # Maps a document type name (e.g. "company") to its embedding mode.
    document_types: Dict[str, str] = field(default_factory=dict)

@dataclass
class EmbeddingProviderConfig:
api_key: Optional[str] = None
Expand Down Expand Up @@ -201,6 +206,13 @@ def load_embedding_config(self, path: str = "config_embedding.yaml"):

self.preferred_embedding_provider: str = data["preferred_provider"]
self.embedding_providers: Dict[str, EmbeddingProviderConfig] = {}

# Load embedding mode configuration
embedding_mode_data = data.get("embedding_mode", {})
self.embedding_mode = EmbeddingModeConfig(
default=embedding_mode_data.get("default", "single"),
document_types=embedding_mode_data.get("document_types", {})
)

for name, cfg in data.get("providers", {}).items():
# Extract configuration values from the YAML
Expand Down Expand Up @@ -517,6 +529,18 @@ def get_llm_provider(self, provider_name: Optional[str] = None) -> Optional[LLMP
return self.llm_endpoints[self.preferred_llm_endpoint]

return None

def get_embedding_mode(self, document_type: str) -> str:
    """Return the embedding mode configured for *document_type*.

    Resolution order: the per-type entry in the embedding-mode config,
    then the configured default, then the literal "single" when no
    embedding-mode configuration has been loaded at all.
    """
    try:
        mode_config = self.embedding_mode
    except AttributeError:
        # load_embedding_config() was never called; keep legacy behavior.
        return "single"

    # Per-type override wins; otherwise fall back to the configured default.
    return mode_config.document_types.get(document_type, mode_config.default)

# Global singleton
CONFIG = AppConfig()
11 changes: 11 additions & 0 deletions code/config/config_embedding.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
preferred_provider: openai

# Embedding generation mode configuration
embedding_mode:
# Default mode for documents (single or multi)
default: single

# Per-document-type configuration
# Set to 'multi' to generate multiple specialized embeddings per document
# Set to 'single' to generate one embedding per document (legacy behavior)
document_types:
company: multi

providers:
openai:
api_key_env: OPENAI_API_KEY
Expand Down
13 changes: 9 additions & 4 deletions code/scraping/incrementalCrawlAndLoad.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,9 +323,9 @@ async def _process_single_url(self, url: str) -> bool:
else:
self.stats["schema_types"][schema_type] = 1

# Step 3: Prepare documents for database
# Step 3: Prepare documents for database with specialized embedding texts
documents_to_upload = []
docs, _ = prepare_documents_from_json(final_url, schemas_str, self.db_name)
docs, embedding_texts = prepare_documents_from_json(final_url, schemas_str, self.db_name)
documents_to_upload.extend(docs)

# Step 4: Generate embeddings and upload
Expand All @@ -335,8 +335,13 @@ async def _process_single_url(self, url: str) -> bool:
provider_config = CONFIG.get_embedding_provider(provider)
model = provider_config.model if provider_config else None

# Extract texts for embedding
texts = [doc["schema_json"] for doc in documents_to_upload]
# Use specialized embedding texts if available, otherwise fall back to schema_json
if embedding_texts and len(embedding_texts) == len(documents_to_upload):
texts = embedding_texts
logger.debug(f"Using {len(texts)} specialized embedding texts from multi-embedding generator")
else:
texts = [doc["schema_json"] for doc in documents_to_upload]
logger.debug(f"Falling back to schema_json for {len(texts)} documents (multi-embedding texts not available)")

# Generate embeddings
embeddings = await batch_get_embeddings(texts, provider, model)
Expand Down
24 changes: 18 additions & 6 deletions code/tools/db_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -821,6 +821,9 @@ async def loadJsonToDB(file_path: str, site: str, batch_size: int = 100, delete_
if json_only_format:
print("Detected JSON-only format. URLs will be extracted from within the JSON data.")

# Track both documents and their specialized embedding texts
all_embedding_texts = []

# Process each line to extract documents
for line in lines:
try:
Expand All @@ -830,9 +833,10 @@ async def loadJsonToDB(file_path: str, site: str, batch_size: int = 100, delete_
if url is None or json_data is None:
continue

# Prepare documents
documents, _ = prepare_documents_from_json(url, json_data, site)
# Prepare documents with specialized embedding texts
documents, embedding_texts = prepare_documents_from_json(url, json_data, site)
all_documents.extend(documents)
all_embedding_texts.extend(embedding_texts)
except Exception as e:
print(f"Error processing line: {str(e)}")
continue
Expand All @@ -844,8 +848,13 @@ async def loadJsonToDB(file_path: str, site: str, batch_size: int = 100, delete_

# Open file to write documents with embeddings
with open(embeddings_path, 'w', encoding='utf-8') as embed_file:
# Extract texts for embedding
texts = [doc["schema_json"] for doc in all_documents]
# Use specialized embedding texts if available, otherwise fall back to schema_json
if all_embedding_texts and len(all_embedding_texts) == len(all_documents):
texts = all_embedding_texts
print(f"Using {len(texts)} specialized embedding texts from multi-embedding generator")
else:
texts = [doc["schema_json"] for doc in all_documents]
print(f"Falling back to schema_json for {len(texts)} documents (multi-embedding texts not available)")

# Process in batches
total_documents = 0
Expand Down Expand Up @@ -877,8 +886,11 @@ async def loadJsonToDB(file_path: str, site: str, batch_size: int = 100, delete_
# Ensure JSON has no newlines
doc_json = doc['schema_json'].replace('\n', ' ')

# Write to embeddings file
embed_file.write(f"{doc['url']}\t{doc_json}\t{embedding_str}\n")
# Write to embeddings file in new multi-embedding format
# Format: URL \t JSON \t embedding \t base_doc_id \t embedding_type
base_doc_id = doc.get('base_doc_id', '')
embedding_type = doc.get('embedding_type', '')
embed_file.write(f"{doc['url']}\t{doc_json}\t{embedding_str}\t{base_doc_id}\t{embedding_type}\n")

docs_with_embeddings.append(doc)

Expand Down
112 changes: 89 additions & 23 deletions code/tools/db_load_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,11 @@ def get_item_name(item: Dict[str, Any]) -> str:

def prepare_documents_from_json(url: str, json_data: str, site: str) -> Tuple[List[Dict[str, Any]], List[str]]:
"""
Prepare documents from URL and JSON data.
Prepare documents from URL and JSON data using configurable embedding approach.

Uses configuration to determine embedding mode per document type:
- Single mode: Creates one embedding per document (legacy behavior)
- Multi mode: Creates multiple specialized embeddings per document for better search

Args:
url: URL for the item
Expand All @@ -167,6 +171,10 @@ def prepare_documents_from_json(url: str, json_data: str, site: str) -> Tuple[Li
Tuple of (documents, texts_for_embedding)
"""
try:
# Import dependencies
from tools.multi_embedding_generator import generate_document_embeddings, detect_item_type
from config.config import CONFIG

# Parse and trim the JSON
json_obj = json.loads(json_data)
trimmed_json = trim_schema_json(json_obj, site)
Expand All @@ -189,17 +197,40 @@ def prepare_documents_from_json(url: str, json_data: str, site: str) -> Tuple[Li
item_url = url if i == 0 else f"{url}#{i}"
item_json = json.dumps(item)

# Add document to batch
doc = {
"id": str(int64_hash(item_url)),
"schema_json": item_json,
"url": item_url,
"name": get_item_name(item),
"site": site
}
# Determine embedding mode based on configuration
item_type = detect_item_type(item)
embedding_mode = CONFIG.get_embedding_mode(item_type)

documents.append(doc)
texts.append(item_json)
if embedding_mode == "multi":
# Generate multiple specialized embeddings for this item
multi_embeddings = generate_document_embeddings(item, item_url, site)

# Create a document for each specialized embedding
for emb_data in multi_embeddings:
doc = {
"id": emb_data["id"],
"base_doc_id": emb_data["base_doc_id"],
"embedding_type": emb_data["embedding_type"],
"schema_json": item_json,
"url": item_url,
"name": get_item_name(item),
"site": site
}

documents.append(doc)
texts.append(emb_data["embedding_text"])
else:
# Single embedding mode (legacy behavior)
doc = {
"id": str(int64_hash(item_url)),
"schema_json": item_json,
"url": item_url,
"name": get_item_name(item),
"site": site
}

documents.append(doc)
texts.append(item_json)

return documents, texts
except Exception as e:
Expand All @@ -210,17 +241,38 @@ def documents_from_csv_line(line, site):
"""
Parse a line with URL, JSON, and embedding into document objects.

Supports both legacy format (single embedding) and new multi-embedding format:
- Legacy: URL \t JSON \t embedding
- Multi: URL \t JSON \t embedding \t base_doc_id \t embedding_type

Args:
line: Tab-separated line with URL, JSON, and embedding
line: Tab-separated line with URL, JSON, and embedding data
site: Site identifier

Returns:
List of document objects
"""
try:
url, json_data, embedding_str = line.strip().split('\t')
parts = line.strip().split('\t')

if len(parts) < 3:
print(f"Error: Line has insufficient columns ({len(parts)} < 3)")
return []

url = parts[0]
json_data = parts[1]
embedding_str = parts[2]

# Check if this is the new multi-embedding format
is_multi_format = len(parts) >= 5
base_doc_id = parts[3] if is_multi_format else None
embedding_type = parts[4] if is_multi_format else None

# Parse embedding
embedding_str = embedding_str.replace("[", "").replace("]", "")
embedding = [float(x) for x in embedding_str.split(',')]

# Parse and trim JSON
js = json.loads(json_data)
js = trim_schema_json(js, site)
except Exception as e:
Expand All @@ -240,19 +292,33 @@ def documents_from_csv_line(line, site):
if item is None:
continue

# No longer filtering by should_include_item - trimming already handles this
# URL handling
item_url = url if i == 0 else f"{url}#{i}"
name = get_item_name(item)

# Ensure no None values in the document
doc = {
"id": str(int64_hash(item_url)),
"embedding": embedding,
"schema_json": json.dumps(item),
"url": item_url or "",
"name": name or "Unnamed Item",
"site": site or "unknown"
}
# Create document based on format
if is_multi_format:
# New multi-embedding format
doc = {
"id": f"{base_doc_id}_{embedding_type}", # Use the stored multi-embedding ID
"base_doc_id": base_doc_id, # Link back to original document
"embedding_type": embedding_type, # Type of this embedding
"embedding": embedding,
"schema_json": json.dumps(item),
"url": item_url or "",
"name": name or "Unnamed Item",
"site": site or "unknown"
}
else:
# Legacy single embedding format
doc = {
"id": str(int64_hash(item_url)),
"embedding": embedding,
"schema_json": json.dumps(item),
"url": item_url or "",
"name": name or "Unnamed Item",
"site": site or "unknown"
}

# Additional validation to ensure no None values
for key, value in doc.items():
Expand Down
Loading
NOTE (review): the line below was proxy-injected junk, translated for the record — "Click — this is a PHP browser service provided by indexloc; do not enter any passwords or download anything." It is not part of the pull request and should be stripped from this capture.