"""
Semantic Search & Fuzzy Matching for Real Estate Extraction
============================================================
Uses spaCy word embeddings for similarity-based matching.
Handles typos, synonyms, and variations automatically.

Version: 3.1
Dependencies: spaCy with word vectors (pl_core_news_lg)
"""

import re
import logging
from typing import List, Optional, Tuple, Set
from difflib import SequenceMatcher

try:
    import spacy
    from spacy.tokens import Doc, Token
    SPACY_AVAILABLE = True
except ImportError:
    SPACY_AVAILABLE = False

logger = logging.getLogger(__name__)


class SemanticMatcher:
    """
    Keyword matching that combines exact, fuzzy and embedding-based strategies.

    Features:
    - Exact (substring) matching
    - Fuzzy string matching via difflib (tolerates small typos)
    - Semantic similarity via spaCy token vectors (only when a model with
      word vectors is supplied)
    - Context-aware matching (keyword close to a context word)
    - Negation detection (negation word shortly before the keyword)

    Example:
        matcher = SemanticMatcher(nlp)

        # Exact match
        matcher.match("garaż", ["garaż"])  # → True

        # Typo handling (missing diacritic: ratio 0.8 >= 0.75)
        matcher.match("garaz", ["garaż"])  # → True (fuzzy)

        # Synonym detection -- the outcome depends on the loaded vectors
        matcher.match("parking", ["garaż"])  # → True if similarity >= 0.70
    """

    # Negation cues that must stand as whole words, so that "pomieszczenie"
    # or "bezpieczny" are not mistaken for "nie" / "bez".  Multi-word cues
    # such as "nie ma", "nie posiada", "nie zawiera" and "brak jest" all start
    # with one of these words, and "brak\w*" also covers inflections such as
    # "braku" or "brakuje".
    _NEGATION_RE = re.compile(r"\b(?:nie|bez|nieposiada|brak\w*)\b")

    # Punctuation removed from token edges before fuzzy comparison.
    _EDGE_PUNCTUATION = ".,;:!?()[]{}\"'"

    def __init__(self, nlp=None, use_semantic: bool = True):
        """
        Initialize semantic matcher.

        Args:
            nlp: spaCy language model (with word vectors), or None to use
                exact and fuzzy matching only
            use_semantic: Enable semantic similarity (requires word vectors)
        """
        self.nlp = nlp

        # Inclusive lower bounds used by match() and find_best_match().
        self.fuzzy_threshold = 0.75  # String similarity threshold
        self.semantic_threshold = 0.70  # Word embedding similarity threshold

        self.use_semantic = use_semantic and nlp is not None

        # Probe the model once; without vectors, similarity is meaningless.
        if self.use_semantic:
            try:
                test_doc = nlp("test")
                if not test_doc[0].has_vector:
                    logger.warning("spaCy model has no word vectors. Semantic matching disabled.")
                    self.use_semantic = False
            except Exception as e:
                # Best effort: any failure of the probe just disables the
                # semantic strategy instead of breaking construction.
                logger.warning(f"Could not verify word vectors: {e}")
                self.use_semantic = False

    # ========================================================================
    # CORE MATCHING METHODS
    # ========================================================================

    def match(self, text: str, keywords: List[str], threshold: Optional[float] = None) -> bool:
        """
        Check if any keyword matches text using multiple strategies.

        Strategies (tried in order for each keyword):
        1. Exact substring match
        2. Fuzzy match (handles typos)
        3. Semantic match (handles synonyms; only if enabled)

        Args:
            text: Text to search in
            keywords: List of keywords to match; empty/blank keywords are
                ignored (an empty string would otherwise match any text)
            threshold: Custom similarity threshold applied to both the fuzzy
                and the semantic strategy. None selects the instance
                defaults; an explicit 0.0 is honoured.

        Returns:
            True if any keyword matches
        """
        text = text.lower().strip()

        # "is None" rather than "or": 0.0 is a legitimate explicit threshold.
        fuzzy_limit = self.fuzzy_threshold if threshold is None else threshold
        semantic_limit = self.semantic_threshold if threshold is None else threshold

        for keyword in keywords:
            keyword = keyword.lower().strip()
            if not keyword:
                continue

            # Strategy 1: Exact match
            if keyword in text:
                return True

            # Strategy 2: Fuzzy match (typos)
            if self._fuzzy_match(text, keyword, fuzzy_limit):
                return True

            # Strategy 3: Semantic match (synonyms)
            if self.use_semantic and self._semantic_match(text, keyword, semantic_limit):
                return True

        return False

    def find_best_match(self, text: str, keywords: List[str]) -> Optional[Tuple[str, float]]:
        """
        Find best matching keyword with confidence score.

        A keyword qualifies under the same rules as match(): exact substring
        (score 1.0), fuzzy score >= fuzzy_threshold, or semantic score >=
        semantic_threshold.  On equal scores the earlier keyword wins.

        Args:
            text: Text to search in
            keywords: List of candidate keywords; blank entries are ignored

        Returns:
            Tuple of (lower-cased, stripped keyword, confidence) or None
        """
        text = text.lower().strip()
        best_match = None
        best_score = 0.0

        for keyword in keywords:
            keyword = keyword.lower().strip()
            if not keyword:
                continue

            if keyword in text:
                # Exact match: no other strategy can score higher.
                score = 1.0
            else:
                score = 0.0

                fuzzy_score = self._get_fuzzy_score(text, keyword)
                if fuzzy_score >= self.fuzzy_threshold:
                    score = fuzzy_score

                if self.use_semantic:
                    semantic_score = self._get_semantic_score(text, keyword)
                    if semantic_score >= self.semantic_threshold:
                        score = max(score, semantic_score)

            if score > best_score:
                best_score = score
                best_match = keyword

        return (best_match, best_score) if best_match is not None else None

    def match_with_context(self, text: str, keywords: List[str],
                          context_keywords: List[str],
                          window: int = 30) -> bool:
        """
        Match keyword only if it appears near a context keyword.

        Only exact (case-insensitive) substring occurrences are considered.

        Example:
            text = "Dom ma dobry stan techniczny i świetną lokalizację"
            keywords = ["dobry"]
            context_keywords = ["stan"]
            → Returns True (dobry near stan)

            text = "Dom w dobrej lokalizacji, stan do remontu"
            keywords = ["dobry"]
            context_keywords = ["stan"]
            → Returns False ("dobry" does not occur literally)

        Args:
            text: Text to search
            keywords: Keywords to find
            context_keywords: At least one must appear near a keyword
            window: Maximum distance in characters on either side

        Returns:
            True if keyword found near context
        """
        text = text.lower()
        lowered_context = [ctx_kw.lower() for ctx_kw in context_keywords]

        for keyword in keywords:
            keyword = keyword.lower()

            for match_start in self._find_all_occurrences(text, keyword):
                match_end = match_start + len(keyword)

                # The searched span includes the keyword itself plus `window`
                # characters before and after it.
                context_start = max(0, match_start - window)
                context_end = min(len(text), match_end + window)
                context = text[context_start:context_end]

                if any(ctx_kw in context for ctx_kw in lowered_context):
                    return True

        return False

    def is_negated(self, text: str, keyword: str, window: int = 30) -> bool:
        """
        Check if keyword appears in negated context.

        Only the first literal occurrence of the keyword is examined.  A
        negation cue counts only as a whole word (or fused directly onto the
        keyword, as in "nieogrzewany"), so words that merely contain "nie" or
        "bez" do not trigger.

        Example:
            "Dom nie ma garażu" → is_negated("...", "garaż") = True
            "Dom ma garaż" → is_negated("...", "garaż") = False
            "Pomieszczenie i garaż" → is_negated("...", "garaż") = False

        Args:
            text: Text to analyze
            keyword: Keyword to check
            window: Characters before keyword to check

        Returns:
            True if negation detected
        """
        text = text.lower()
        keyword = keyword.lower()

        # Find keyword position
        pos = text.find(keyword)
        if pos == -1:
            return False

        # Search text[context_start:pos] in place instead of slicing: with
        # pos/endpos the \b at the left edge still sees the real preceding
        # character, so a word cut in half by the window cannot masquerade as
        # a standalone negation.
        context_start = max(0, pos - window)
        return self._NEGATION_RE.search(text, context_start, pos) is not None

    # ========================================================================
    # INTERNAL MATCHING STRATEGIES
    # ========================================================================

    def _fuzzy_match(self, text: str, keyword: str, threshold: float) -> bool:
        """Return True if the fuzzy score of keyword in text reaches threshold."""
        return self._get_fuzzy_score(text, keyword) >= threshold

    def _get_fuzzy_score(self, text: str, keyword: str) -> float:
        """
        Calculate fuzzy string similarity using SequenceMatcher.

        The keyword is compared with every whitespace-separated token of the
        text, both as written and with edge punctuation removed, and the best
        score is returned.  A token's score is the larger of the
        SequenceMatcher ratio and, when one string contains the other, the
        length ratio shorter/longer.

        Returns:
            Best score in [0.0, 1.0]; 0.0 for text without tokens
        """
        best_score = 0.0

        for raw_word in text.split():
            stripped = raw_word.strip(self._EDGE_PUNCTUATION)
            # Keep the raw token as well so keywords that themselves carry
            # punctuation (e.g. "c.o.") never score lower than before.
            if stripped and stripped != raw_word:
                candidates = (raw_word, stripped)
            else:
                candidates = (raw_word,)

            for word in candidates:
                score = SequenceMatcher(None, word, keyword).ratio()

                # Containment bonus: share of the longer string that is covered.
                if keyword in word:
                    score = max(score, len(keyword) / len(word))
                elif word in keyword:
                    score = max(score, len(word) / len(keyword))

                best_score = max(best_score, score)

        return best_score

    def _semantic_match(self, text: str, keyword: str, threshold: float) -> bool:
        """Return True if the semantic score of keyword in text reaches threshold."""
        if not self.use_semantic or self.nlp is None:
            return False

        return self._get_semantic_score(text, keyword) >= threshold

    def _get_semantic_score(self, text: str, keyword: str) -> float:
        """
        Calculate semantic similarity using spaCy word vectors.

        Every non-punctuation, non-whitespace token of the text is compared
        with every such token of the keyword; the highest pairwise
        similarity is returned.  Tokens without a vector are skipped.

        Returns:
            Best pairwise similarity, or 0.0 when semantic matching is
            disabled, no comparable tokens exist, or spaCy raises
        """
        if not self.use_semantic or self.nlp is None:
            return 0.0

        try:
            text_doc = self.nlp(text)
            keyword_doc = self.nlp(keyword)

            best_similarity = 0.0

            for token in text_doc:
                if token.is_punct or token.is_space:
                    continue

                for kw_token in keyword_doc:
                    if kw_token.is_punct or kw_token.is_space:
                        continue

                    if token.has_vector and kw_token.has_vector:
                        similarity = token.similarity(kw_token)
                        best_similarity = max(best_similarity, similarity)

            return best_similarity

        except Exception as e:
            # Best effort: a failing model must not break the other strategies.
            logger.debug(f"Semantic matching error: {e}")
            return 0.0

    def _find_all_occurrences(self, text: str, keyword: str) -> List[int]:
        """Find all starting positions of keyword in text (overlaps included)."""
        positions = []
        start = 0

        while True:
            pos = text.find(keyword, start)
            if pos == -1:
                break
            positions.append(pos)
            # Advance by one character so overlapping occurrences are found.
            start = pos + 1

        return positions

    def _is_match_negated(self, text: str, keywords: List[str]) -> bool:
        """
        Decide whether a successful match of `keywords` in `text` is negated.

        Only keywords that occur literally in the text can be located, so only
        those are inspected.  The match counts as negated when at least one
        keyword occurs literally and every such keyword is negated.  Matches
        found purely by fuzzy/semantic similarity have no position and are
        treated as not negated.
        """
        lowered = text.lower()
        present = [
            kw.strip() for kw in keywords
            if kw.strip() and kw.strip().lower() in lowered
        ]
        if not present:
            return False
        return all(self.is_negated(text, kw) for kw in present)

    # ========================================================================
    # BATCH OPERATIONS
    # ========================================================================

    def match_any(self, text: str, keyword_groups: dict) -> Optional[str]:
        """
        Try multiple keyword groups, return first matching group name.

        Example:
            groups = {
                'garage': ['garaż', 'garażem', 'parking'],
                'garden': ['ogród', 'działka', 'teren zielony']
            }
            text = "Dom z parkingiem podziemnym"
            → Returns 'garage'

        Args:
            text: Text to search
            keyword_groups: Dict of {group_name: [keywords]}; groups are tried
                in the dict's iteration order

        Returns:
            Name of first matching group, or None
        """
        for group_name, keywords in keyword_groups.items():
            if self.match(text, keywords):
                return group_name
        return None

    def extract_matched_values(self, text: str, value_map: dict) -> List[str]:
        """
        Extract all values whose keywords match in text and are not negated.

        Negation is evaluated for the keywords that actually occur in the
        text, not just for the first keyword of each list.

        Example:
            value_map = {
                'balcony': ['balkon', 'balkony'],
                'garage': ['garaż', 'parking'],
                'garden': ['ogród', 'działka']
            }
            text = "Dom z balkonem i garażem"
            → Returns ['balcony', 'garage']

            text = "Dom bez parkingu"
            → Returns []

        Args:
            text: Text to search
            value_map: Dict of {value: [keywords]}

        Returns:
            List of matching values, in value_map order
        """
        return [
            value
            for value, keywords in value_map.items()
            if self.match(text, keywords) and not self._is_match_negated(text, keywords)
        ]


class ContextAwareExtractor:
    """
    Context-aware extraction using semantic matching.

    Wraps a SemanticMatcher and accepts a value only when its keywords match,
    any required context word is nearby, and the match is not negated.
    """

    # Context words that disambiguate() looks for near a candidate keyword,
    # per field.  Fields not listed here get no context bonus.
    _FIELD_CONTEXTS = {
        'condition': ['stan', 'techniczny', 'budynku', 'mieszkania'],
        'heating': ['ogrzewanie', 'c.o.', 'ciepło', 'grzewcz'],
        'material': ['wykonany', 'zbudowany', 'materiał', 'budow'],
        'market': ['rynek', 'sprzedaż', 'oferta'],
        'ownership': ['własność', 'prawo', 'tytuł'],
    }

    def __init__(self, nlp=None):
        """
        Initialize context-aware extractor.

        Args:
            nlp: spaCy language model, or None for exact/fuzzy matching only
        """
        self.matcher = SemanticMatcher(nlp, use_semantic=True)
        self.nlp = nlp

    def _is_match_negated(self, text: str, keywords: list) -> bool:
        """
        Decide whether a successful match of `keywords` in `text` is negated.

        Negation can only be located for keywords that occur literally in the
        text, so only those are inspected (not merely the first keyword of the
        list).  The match counts as negated when at least one keyword occurs
        literally and every such keyword is negated; matches without a literal
        occurrence are treated as not negated.
        """
        lowered = text.lower()
        present = [
            kw.strip() for kw in keywords
            if kw.strip() and kw.strip().lower() in lowered
        ]
        if not present:
            return False
        return all(self.matcher.is_negated(text, kw) for kw in present)

    def extract_with_context(self, text: str,
                            extraction_patterns: dict,
                            context_requirements: dict) -> dict:
        """
        Extract values only when context requirements are met.

        For each field the candidate values are tried in dict order and the
        first one that matches, satisfies the context requirement (if the
        field has one) and is not negated is taken.

        Example:
            extraction_patterns = {
                'condition': {
                    'dobry': ['dobry', 'dobry stan'],
                    'do remontu': ['do remontu', 'wymaga remontu']
                }
            }

            context_requirements = {
                'condition': ['stan', 'techniczny', 'budynku']
            }

            text = "Dom w dobrej lokalizacji, stan techniczny do remontu"
            → {'condition': 'do remontu'}  # Extracted because "stan" context found

        Args:
            text: Text to extract from
            extraction_patterns: {field: {value: [keywords]}}
            context_requirements: {field: [context_keywords]}; a missing or
                empty entry means no context is required for that field

        Returns:
            Dict of extracted {field: value}; fields without an accepted
            value are omitted
        """
        results = {}

        for field, value_patterns in extraction_patterns.items():
            context_keywords = context_requirements.get(field, [])

            for canonical_value, keywords in value_patterns.items():
                if not self.matcher.match(text, keywords):
                    continue

                # Context is only enforced when the field declares some.
                if context_keywords and not self.matcher.match_with_context(
                        text, keywords, context_keywords):
                    continue

                if self._is_match_negated(text, keywords):
                    continue

                results[field] = canonical_value
                break

        return results

    def disambiguate(self, text: str, field: str, candidates: dict) -> Optional[str]:
        """
        Disambiguate between multiple possible values using context.

        Scoring per matching candidate:
            +0.5  its keywords match the text
            +0.3  a field context word lies within 50 characters of a keyword
            -0.8  the match is negated
            +0.1  its longest keyword has more than 10 characters
        The highest-scoring candidate wins (the earlier one on ties), but only
        if its score exceeds 0.5 -- a bare keyword match without any bonus is
        not enough.

        Example:
            text = "Stan techniczny dobry, stan prawny do sprawdzenia"
            field = "condition"
            candidates = {
                'dobry': ['dobry', 'w dobrym stanie'],
                'do sprawdzenia': ['do sprawdzenia', 'wymaga sprawdzenia']
            }

            → Returns 'dobry'  # Both score equally; the first candidate wins

        Args:
            text: Text to analyze
            field: Field being extracted; selects the context word list
            candidates: {value: [keywords]} candidates

        Returns:
            Best matching value, or None
        """
        context_kws = self._FIELD_CONTEXTS.get(field, [])

        best_match = None
        best_score = 0.0

        for value, keywords in candidates.items():
            # Candidates whose keywords do not match are never selected.
            if not self.matcher.match(text, keywords):
                continue

            score = 0.5

            # Bonus for context proximity
            if context_kws and self.matcher.match_with_context(
                    text, keywords, context_kws, window=50):
                score += 0.3

            # Penalty for negation
            if self._is_match_negated(text, keywords):
                score -= 0.8

            # Bonus for longer/more specific keywords
            if max(len(kw) for kw in keywords) > 10:
                score += 0.1

            if score > best_score:
                best_score = score
                best_match = value

        return best_match if best_score > 0.5 else None


# ============================================================================
# CONVENIENCE FUNCTIONS
# ============================================================================

def create_semantic_matcher(nlp=None) -> SemanticMatcher:
    """
    Factory function to create semantic matcher.

    Args:
        nlp: spaCy language model to use.  When None and spaCy is importable,
            the "pl_core_news_lg" model is loaded; if loading fails the
            matcher is created without a model (semantic matching disabled).

    Returns:
        A SemanticMatcher built around the resolved model (possibly None)
    """
    if nlp is None and SPACY_AVAILABLE:
        try:
            import spacy
            nlp = spacy.load("pl_core_news_lg")
        except Exception as exc:
            # Best effort: fall back to exact/fuzzy matching.  Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            logger.warning("Could not load spaCy model. Semantic matching disabled.")
            logger.debug("spaCy model load failure: %s", exc)

    return SemanticMatcher(nlp)


def create_context_extractor(nlp=None) -> ContextAwareExtractor:
    """
    Factory function to create context-aware extractor.

    Args:
        nlp: spaCy language model to use.  When None and spaCy is importable,
            the "pl_core_news_lg" model is loaded; if loading fails the
            extractor is created without a model.

    Returns:
        A ContextAwareExtractor built around the resolved model (possibly None)
    """
    if nlp is None and SPACY_AVAILABLE:
        try:
            import spacy
            nlp = spacy.load("pl_core_news_lg")
        except Exception as exc:
            # Best effort: continue without a model.  Catching Exception (not
            # a bare except) lets KeyboardInterrupt and SystemExit propagate.
            logger.warning("Could not load spaCy model. Context extraction disabled.")
            logger.debug("spaCy model load failure: %s", exc)

    return ContextAwareExtractor(nlp)