ArticleProcessor API Reference¶
Complete reference for the shared ArticleProcessor class used by all content sources.
ArticleProcessor¶
Processes articles from any content source (Wikipedia, web, files) with unified entity extraction.
Class Definition¶
from backend.kg_construction.article_processor import ArticleProcessor
class ArticleProcessor:
"""
Shared processor for all content sources.
Handles:
- Entity extraction (LLM or heuristic)
- Relationship identification
- Vector embedding generation
- Graph node and edge creation
"""
Constructor¶
def __init__(
self,
conn: kuzu.Connection,
use_llm: bool = True,
max_entities: int = 50,
extract_relationships: bool = True
)
Parameters:
- conn (kuzu.Connection, required): LadybugDB database connection
- use_llm (bool, default: True): Use LLM extraction (True) or heuristic extraction (False)
- max_entities (int, default: 50): Maximum entities to extract per article
- extract_relationships (bool, default: True): Extract relationships between entities
Example:
import real_ladybug as kuzu
db = kuzu.Database("knowledge.db")
conn = kuzu.Connection(db)
# LLM extraction with relationships
processor = ArticleProcessor(conn, use_llm=True, extract_relationships=True)
# Fast heuristic extraction without relationships
processor_fast = ArticleProcessor(conn, use_llm=False, extract_relationships=False)
Methods¶
process_article()¶
def process_article(
self,
title: str,
content: str,
url: str
) -> Dict[str, Any]:
"""
Process a single article and add to knowledge graph.
Args:
title: Article title
content: Article text content
url: Article URL (for deduplication)
Returns:
Dict with extraction statistics:
{
"entities_count": int,
"relationships_count": int,
"processing_time_ms": float
}
Raises:
ValueError: If title or content is empty
openai.OpenAIError: If LLM extraction fails
Example:
>>> stats = processor.process_article(
... title="Azure Kubernetes Service",
... content="AKS is a managed container orchestration...",
... url="https://learn.microsoft.com/en-us/azure/aks/what-is-aks"
... )
>>> print(stats)
{'entities_count': 42, 'relationships_count': 28, 'processing_time_ms': 3214.5}
"""
extract_entities()¶
def extract_entities(
self,
title: str,
content: str
) -> List[Entity]:
"""
Extract named entities from article content.
Args:
title: Article title (used as context)
content: Article text content
Returns:
List of Entity objects with name and type
Extraction method depends on use_llm setting:
- use_llm=True: Uses OpenAI to identify entities
- use_llm=False: Uses regex patterns and NER heuristics
Example:
>>> entities = processor.extract_entities(
... title="Azure Kubernetes Service",
... content="AKS is a managed Kubernetes service..."
... )
>>> for entity in entities[:3]:
... print(f"{entity.name} ({entity.type})")
Azure Kubernetes Service (TECHNOLOGY)
Kubernetes (TECHNOLOGY)
Azure (PLATFORM)
"""
extract_relationships()¶
def extract_relationships(
self,
title: str,
content: str,
entities: List[Entity]
) -> List[Relationship]:
"""
Extract relationships between entities.
Args:
title: Article title (context)
content: Article text content
entities: Previously extracted entities
Returns:
List of Relationship objects with source, relation, target
Only available when use_llm=True and extract_relationships=True.
Example:
>>> relationships = processor.extract_relationships(
... title="Azure Kubernetes Service",
... content="AKS manages Kubernetes clusters...",
... entities=entities
... )
>>> for rel in relationships[:3]:
... print(f"{rel.source} --[{rel.relation}]--> {rel.target}")
AKS --[MANAGES]--> Kubernetes clusters
Kubernetes --[RUNS_ON]--> Azure
AKS --[IS_A]--> managed service
"""
create_section_node()¶
def create_section_node(
self,
title: str,
content: str,
url: str,
parent_url: Optional[str] = None
) -> str:
"""
Create Section node in knowledge graph.
Args:
title: Section title
content: Section content (for embedding)
url: Section URL (unique identifier)
parent_url: Parent article URL (for hierarchy)
Returns:
Section URL (node identifier)
Creates:
- Section node with title, URL, embedding
- PART_OF edge to parent article (if parent_url provided)
Example:
>>> section_url = processor.create_section_node(
... title="AKS Overview",
... content="Azure Kubernetes Service provides...",
... url="https://learn.microsoft.com/en-us/azure/aks/what-is-aks#overview",
... parent_url="https://learn.microsoft.com/en-us/azure/aks/what-is-aks"
... )
>>> print(section_url)
https://learn.microsoft.com/en-us/azure/aks/what-is-aks#overview
"""
create_entity_node()¶
def create_entity_node(
self,
entity: Entity,
source_url: str
) -> None:
"""
Create Entity node in knowledge graph.
Args:
entity: Entity object with name and type
source_url: URL of article containing entity
Creates:
- Entity node with name, type
- MENTIONED_IN edge to source Section
Handles duplicates by merging (same entity name across articles).
Example:
>>> entity = Entity(name="Kubernetes", type="TECHNOLOGY")
>>> processor.create_entity_node(
... entity=entity,
... source_url="https://learn.microsoft.com/en-us/azure/aks/what-is-aks"
... )
"""
create_relationship_edge()¶
def create_relationship_edge(
self,
relationship: Relationship
) -> None:
"""
Create relationship edge between entities.
Args:
relationship: Relationship object with source, relation, target
Creates edge between Entity nodes with specified relation type.
Example:
>>> rel = Relationship(
... source="AKS",
... relation="MANAGES",
... target="Kubernetes clusters"
... )
>>> processor.create_relationship_edge(rel)
"""
Data Models¶
Entity¶
from dataclasses import dataclass
@dataclass
class Entity:
"""
Represents a named entity extracted from text.
Attributes:
name: Entity name (normalized)
type: Entity type (PERSON, ORGANIZATION, TECHNOLOGY, CONCEPT, etc.)
"""
name: str
type: str
Entity Types:
- PERSON - People, authors, developers
- ORGANIZATION - Companies, projects, teams
- TECHNOLOGY - Tools, frameworks, services
- CONCEPT - Abstract ideas, methodologies
- LOCATION - Places, regions, data centers
- PRODUCT - Software products, services
- EVENT - Conferences, releases, incidents
Example:
entities = [
Entity(name="Azure Kubernetes Service", type="TECHNOLOGY"),
Entity(name="Microsoft", type="ORGANIZATION"),
Entity(name="containerization", type="CONCEPT")
]
Relationship¶
from dataclasses import dataclass
@dataclass
class Relationship:
"""
Represents a semantic relationship between entities.
Attributes:
source: Source entity name
relation: Relationship type (verb-like)
target: Target entity name
"""
source: str
relation: str
target: str
Common Relations:
- IS_A - Type/category relationship
- PART_OF - Component relationship
- USES - Dependency relationship
- MANAGES - Control relationship
- PROVIDES - Service relationship
- RUNS_ON - Platform relationship
- DEVELOPED_BY - Authorship relationship
Example:
relationships = [
Relationship(source="AKS", relation="IS_A", target="managed service"),
Relationship(source="AKS", relation="MANAGES", target="Kubernetes"),
Relationship(source="Kubernetes", relation="RUNS_ON", target="Azure")
]
LLM Extraction Pipeline¶
When use_llm=True, extraction follows this pipeline:
1. Entity Extraction Prompt¶
Given the following article, extract all named entities:
Title: {title}
Content: {content}
Extract entities in these categories:
- PERSON (people, authors)
- ORGANIZATION (companies, projects)
- TECHNOLOGY (tools, frameworks, services)
- CONCEPT (abstract ideas)
- LOCATION (places, regions)
- PRODUCT (software products)
Return as JSON array: [{"name": "...", "type": "..."}]
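A minimal sketch of how step 1 might be wired up. ENTITY_PROMPT is a hypothetical constant holding the template above; the actual prompt-assembly code is not shown in this reference:
import json
import os
from openai import OpenAI

client = OpenAI()

def llm_extract_entities(title: str, content: str) -> list[Entity]:
    # ENTITY_PROMPT (hypothetical) holds the entity extraction template above
    prompt = ENTITY_PROMPT.format(title=title, content=content)
    response = client.chat.completions.create(
        model=os.getenv("OPENAI_MODEL", "gpt-4-turbo-preview"),
        temperature=float(os.getenv("LLM_TEMPERATURE", "0.0")),
        messages=[{"role": "user", "content": prompt}],
    )
    # The prompt asks for a JSON array of {"name": ..., "type": ...} objects
    items = json.loads(response.choices[0].message.content)
    return [Entity(name=item["name"], type=item["type"]) for item in items]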
2. Relationship Extraction Prompt¶
Given these entities from the article, identify relationships:
Entities: {entities}
Content: {content}
Extract relationships as JSON array:
[{"source": "...", "relation": "...", "target": "..."}]
Use relation types: IS_A, PART_OF, USES, MANAGES, PROVIDES, RUNS_ON
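Step 2 parses the same way; only the prompt and target dataclass change. A sketch, reusing the client and imports from the step 1 sketch (RELATIONSHIP_PROMPT is likewise a hypothetical constant holding the template above):
def llm_extract_relationships(entities: list[Entity], content: str) -> list[Relationship]:
    prompt = RELATIONSHIP_PROMPT.format(
        entities=json.dumps([e.name for e in entities]), content=content
    )
    response = client.chat.completions.create(
        model=os.getenv("OPENAI_MODEL", "gpt-4-turbo-preview"),
        messages=[{"role": "user", "content": prompt}],
    )
    # Each item has exactly the Relationship fields: source, relation, target
    items = json.loads(response.choices[0].message.content)
    return [Relationship(**item) for item in items]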
3. Entity Normalization¶
After extraction, entities are normalized:
def normalize_entity(name: str) -> str:
"""
Normalize entity name for consistency.
- Remove extra whitespace
- Title case for proper nouns
- Expand common abbreviations
- Remove parenthetical notes
Example:
>>> normalize_entity("azure kubernetes service (AKS)")
"Azure Kubernetes Service"
"""
4. Vector Embedding¶
Each entity and section gets an embedding:
from openai import OpenAI
client = OpenAI()
response = client.embeddings.create(
model="text-embedding-ada-002",
input=text
)
embedding = response.data[0].embedding
Heuristic Extraction (use_llm=False)¶
When LLM extraction is disabled, the processor falls back to pattern-based extraction:
Entity Patterns¶
ENTITY_PATTERNS = {
"TECHNOLOGY": r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", # Proper nouns
"CONCEPT": r"\b(?:pattern|principle|methodology|approach)\b",
"ORGANIZATION": r"\b(?:Microsoft|Google|Amazon|IBM)\b",
}
Relationship Heuristics¶
- Co-occurrence in the same sentence → weak relationship
- Verb phrases between entities → relation type
- Without an LLM, relationships are fewer and less accurate (see the sketch below)
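A sketch of the co-occurrence heuristic, assuming a hypothetical RELATED_TO relation for weak links (the shipped verb-phrase detection is more involved):
import re

def cooccurrence_relationships(content: str, entities: list[Entity]) -> list[Relationship]:
    names = [e.name for e in entities]
    found = []
    # Naive sentence split; entities sharing a sentence get a weak link
    for sentence in re.split(r"(?<=[.!?])\s+", content):
        present = [n for n in names if n in sentence]
        for i, source in enumerate(present):
            for target in present[i + 1:]:
                found.append(
                    Relationship(source=source, relation="RELATED_TO", target=target)
                )
    return found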
Performance:
- 10x faster than LLM extraction
- 50-70% entity recall vs LLM
- Minimal relationship extraction
Configuration¶
Environment Variables¶
| Variable | Description | Default |
|---|---|---|
| OPENAI_API_KEY | OpenAI API key | None (required) |
| OPENAI_MODEL | LLM model | gpt-4-turbo-preview |
| LLM_TEMPERATURE | Sampling temperature | 0.0 |
| LLM_MAX_RETRIES | Retry attempts | 3 |
| LLM_RETRY_DELAY | Retry delay (seconds) | 1.0 |
| EMBEDDING_MODEL | Embedding model | text-embedding-ada-002 |
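A sketch of how these settings might be read at import time (the actual config module is not shown in this reference):
import os

# Defaults mirror the table above; override via environment variables
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4-turbo-preview")
LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.0"))
LLM_MAX_RETRIES = int(os.getenv("LLM_MAX_RETRIES", "3"))
LLM_RETRY_DELAY = float(os.getenv("LLM_RETRY_DELAY", "1.0"))
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-ada-002")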
Constructor Parameters¶
# High-quality extraction (default)
processor = ArticleProcessor(conn, use_llm=True, max_entities=50, extract_relationships=True)
# Fast extraction for large crawls
processor = ArticleProcessor(conn, use_llm=True, max_entities=30, extract_relationships=False)
# Heuristic extraction (no API cost)
processor = ArticleProcessor(conn, use_llm=False, max_entities=100, extract_relationships=False)
Performance Characteristics¶
Time Complexity¶
- Entity extraction: O(n) where n = content length
- Relationship extraction: O(e²) where e = entity count
- Graph insertion: O(e + r) where r = relationship count
API Cost (LLM Extraction)¶
Per article with gpt-4-turbo-preview:
| Operation | Tokens | Cost |
|---|---|---|
| Entity extraction | ~2,000 | $0.02 |
| Relationship extraction | ~1,500 | $0.015 |
| Embeddings (50 entities) | ~500 | $0.0001 |
| Total per article | ~4,000 | $0.035 |
Benchmarks¶
Measured on Azure AKS documentation article (3,500 words):
| Configuration | Entities | Relationships | Time | Cost |
|---|---|---|---|---|
| LLM + relationships | 42 | 28 | 3.2s | $0.035 |
| LLM - relationships | 42 | 0 | 1.8s | $0.020 |
| Heuristic | 28 | 0 | 0.3s | $0.0001 |
Error Handling¶
Common Exceptions¶
# Empty content
try:
processor.process_article(title="", content="", url="...")
except ValueError as e:
print(f"Invalid input: {e}")
# LLM failure
try:
processor.extract_entities(title, content)
except openai.RateLimitError:
print("Rate limit exceeded, retrying...")
# Database error
try:
processor.create_entity_node(entity, url)
except kuzu.Exception as e: # kuzu aliased from real_ladybug
print(f"Database error: {e}")
Retry Logic¶
LLM calls automatically retry on failure:
@retry(
max_attempts=int(os.getenv("LLM_MAX_RETRIES", "3")),
delay=float(os.getenv("LLM_RETRY_DELAY", "1.0")),
backoff=2.0
)
def call_llm(prompt: str) -> str:
return openai_client.chat.completions.create(...)
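A minimal sketch of a retry decorator matching that usage; the shipped implementation may differ (for example, by catching only OpenAI errors):
import functools
import time

def retry(max_attempts: int = 3, delay: float = 1.0, backoff: float = 2.0):
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            wait = delay
            for attempt in range(1, max_attempts + 1):
                try:
                    return fn(*args, **kwargs)
                except Exception:
                    if attempt == max_attempts:
                        raise          # out of attempts; surface the error
                    time.sleep(wait)   # back off exponentially between tries
                    wait *= backoff
        return wrapper
    return decorator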
Integration Examples¶
With WebContentSource¶
from backend.sources.web_content_source import WebContentSource
from backend.kg_construction.article_processor import ArticleProcessor
import real_ladybug as kuzu
# Setup
db = kuzu.Database("azure_docs.db")
conn = kuzu.Connection(db)
processor = ArticleProcessor(conn, use_llm=True)
# Create source
source = WebContentSource(
url="https://learn.microsoft.com/en-us/azure/aks/what-is-aks",
max_depth=2,
max_links=25
)
# Process articles
for article in source.get_articles():
stats = processor.process_article(
title=article.title,
content=article.content,
url=article.url
)
print(f"Processed {article.title}: {stats['entities_count']} entities")
With WikipediaContentSource¶
from backend.sources.wikipedia_content_source import WikipediaContentSource
source = WikipediaContentSource(title="Kubernetes")
for article in source.get_articles():
processor.process_article(
title=article.title,
content=article.content,
url=article.url
)
Batch Processing¶
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List

def process_url(url: str) -> List[Dict[str, Any]]:
    # Process every article discovered at the URL, one stats dict per article
    source = WebContentSource(url=url)
    return [
        processor.process_article(article.title, article.content, article.url)
        for article in source.get_articles()
    ]
urls = [
"https://example.com/page1",
"https://example.com/page2",
"https://example.com/page3"
]
with ThreadPoolExecutor(max_workers=5) as executor:
    all_stats = [stats for batch in executor.map(process_url, urls) for stats in batch]
print(f"Processed {len(all_stats)} articles")