#!/usr/bin/env python3
# ================================================================================
# HIWOSY™ MEMORY LAYER  V R C R
# ================================================================================
# Self-Learning Semantic Deduplication Technology
#
# Patent Pending: USPTO 63/911,048, 63/915,918, 63/921,878
# Copyright Cases: 1-15035049381, 1-15034439898
# Contact: Ljubisa Kovacevic | kovalubo@gmail.com
# Website: https://www.hiwosy.com
# ================================================================================
"""
Hiwosy™ API Test Script - Enterprise Edition
=============================================
Test your data against the Hiwosy Unified API (3 Products • 1 API Key)

Usage:
    python test_my_data.py your_file.csv
    python test_my_data.py your_file.csv --column "text_column_name"
    python test_my_data.py your_file.csv --limit 100
    python test_my_data.py your_file.csv --dedup_scope batch

Examples:
    python test_my_data.py support_tickets.csv
    python test_my_data.py data.csv --column "message" --limit 500
    python test_my_data.py chat_logs.csv --column "text" --limit 1000
    python test_my_data.py gametox.csv --dedup_scope batch  # Each run independent

Dedup Scope Options:
    --dedup_scope batch      : Find duplicates ONLY within this file (DEFAULT)
                               Each run is independent - won't match previous runs.
                               Best for: Testing, one-time analysis, comparing datasets.
    
    --dedup_scope session    : Find duplicates within last 1 hour
                               Useful for: Live game session monitoring.
    
    --dedup_scope daily      : Find duplicates within last 24 hours
                               Useful for: Daily reports, tracking repeat offenders.
    
    --dedup_scope historical : Find duplicates against ALL historical data
                               Database grows forever - learns from every run.
                               Useful for: Building long-term knowledge base.

Output Formats (Enterprise):
    - JSON: Professional wrapper with metadata, versioning, audit trail
    - CSV: Per-query details with PRODUCT1/PRODUCT2/PRODUCT3 columns
    - Excel: Multi-sheet workbook (Summary, Data, Metadata) - Power BI ready
    - Parquet: Columnar format for big data pipelines (5-10x smaller)
    - SQL: Ready-to-execute INSERT statements

Requirements:
    pip install requests openpyxl pyarrow pandas

Support: email us at hiwosy.com
"""

# ================================================================================
# DEPENDENCY CHECK - User-friendly error for missing 'requests' module
# ================================================================================
try:
    import requests
except ImportError:
    print()
    print("=" * 60)
    print("  ERROR: Missing required module 'requests'")
    print("=" * 60)
    print()
    print("  Please install dependencies first:")
    print()
    print("    pip install requests")
    print()
    print("  Or for full features (Excel, Parquet exports):")
    print()
    print("    pip install -r requirements.txt")
    print()
    print("  Then run this script again.")
    print("=" * 60)
    print()
    import sys
    sys.exit(1)
import csv
import json
import sys
import time
import uuid
import os
from datetime import datetime, timezone

# Optional enterprise format dependencies
try:
    import openpyxl
    from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
    from openpyxl.utils.dataframe import dataframe_to_rows
    from openpyxl.chart import BarChart, PieChart, Reference
    EXCEL_AVAILABLE = True
except ImportError:
    EXCEL_AVAILABLE = False
    print("ℹ️  Excel export disabled. Install: pip install openpyxl")

try:
    import pyarrow as pa
    import pyarrow.parquet as pq
    PARQUET_AVAILABLE = True
except ImportError:
    PARQUET_AVAILABLE = False
    print("ℹ️  Parquet export disabled. Install: pip install pyarrow")

try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except ImportError:
    PANDAS_AVAILABLE = False

# ═══════════════════════════════════════════════════════════════════════════════
# 🔑 PASTE YOUR API KEY HERE (You received this from Hiwosy)
# ═══════════════════════════════════════════════════════════════════════════════
# Credentials and base URL used by every API call in this script.
API_KEY = "hiwosy_demo_public_10k"  # 🎁 Trial key: 10,000 free queries!
API_URL = "https://www.hiwosy.com"
# ═══════════════════════════════════════════════════════════════════════════════

# ═══════════════════════════════════════════════════════════════════════════════
# 📁 DATA FILTER REPORT SYSTEM - Configuration
# ═══════════════════════════════════════════════════════════════════════════════
# Filter types that can each be written to a separate report file:
#   1. all        - all combined results
#   2. masters    - only unique/original queries (MASTER status)
#   3. duplicates - only duplicate queries (MERGE status)
#   4. ban        - queries with BAN action recommended
#   5. warning    - queries with WARNING action
#   6. safe       - clean queries (no toxicity)
#   7. cache      - cache analysis (hits and misses)
AVAILABLE_FILTERS = [
    "all",
    "masters",
    "duplicates",
    "ban",
    "warning",
    "safe",
    "cache",
]

# Output formats a report can be written in
AVAILABLE_FORMATS = ["csv", "json", "xlsx", "parquet", "sql"]

# Defaults: every filter in every format. Independent copies, so changing a
# default list never alters the list of available choices.
DEFAULT_FILTERS = list(AVAILABLE_FILTERS)
DEFAULT_FORMATS = list(AVAILABLE_FORMATS)
# ═══════════════════════════════════════════════════════════════════════════════


def _read_plain_lines(lines, limit):
    """Return stripped, non-empty lines (capped at 500 chars each), reading at most *limit* lines."""
    collected = []
    for i, line in enumerate(lines):
        if limit and i >= limit:
            break
        line = line.strip()
        if line:
            collected.append(line[:500])
    return collected


def load_csv(filepath, column=None, limit=None):
    """Load query strings from a CSV file (or a plain one-query-per-line file).

    Args:
        filepath: Path of the file to read. It is decoded as UTF-8; a leading
            byte-order mark is dropped and undecodable bytes are ignored.
        column: Name of the column holding the text. Matched exactly first,
            then case-insensitively. When omitted, the first header that
            matches a list of common text-column names is used, falling back
            to the first column.
        limit: Maximum number of data rows (or lines) to READ. Blank rows
            count towards the limit, so fewer queries may be returned.
            ``None`` or ``0`` means no limit.

    Returns:
        A list of non-empty, stripped strings, each truncated to 500
        characters. The list is empty when the requested column does not
        exist in the file.
    """
    queries = []

    # 'utf-8-sig' behaves like 'utf-8' but drops a leading BOM, which would
    # otherwise become part of the first header name and defeat column lookup.
    with open(filepath, 'r', encoding='utf-8-sig', errors='ignore') as f:
        # Try to detect if it has headers
        sample = f.read(2048)
        f.seek(0)

        try:
            has_header = csv.Sniffer().has_header(sample)
        except csv.Error:
            has_header = True  # Assume header if detection fails

        if not has_header:
            # No header - treat each line as a query
            return _read_plain_lines(f, limit)

        reader = csv.DictReader(f)
        columns = reader.fieldnames

        if not columns:
            print("   ⚠️ No columns found, treating as single-column file")
            f.seek(0)
            return _read_plain_lines(f, limit)

        # Lower-cased header -> original spelling (first occurrence wins)
        by_lower = {}
        for name in columns:
            by_lower.setdefault(name.lower(), name)

        requested = column
        if not column:
            # Auto-detect text column: candidates are tried in priority order
            text_cols = ('query', 'text', 'content', 'message', 'instruction',
                         'question', 'ticket', 'description', 'body', 'comment',
                         'review', 'feedback', 'input', 'prompt', 'notes')
            column = next(
                (by_lower[col] for col in text_cols if col in by_lower),
                columns[0],  # Use first column
            )
        elif column not in columns:
            # Tolerate a case mismatch in an explicitly requested column
            column = by_lower.get(column.lower())

        print(f"   📋 Available columns: {columns}")
        if column is None:
            # Say so explicitly instead of silently returning nothing
            print(f"   ⚠️ Column '{requested}' not found in file - no queries loaded")
            return queries
        print(f"   ✅ Using column: '{column}'")

        for i, row in enumerate(reader):
            if limit and i >= limit:
                break
            # Short rows yield None for missing fields; treat that as empty
            text = (row.get(column) or "").strip()
            if text:
                queries.append(text[:500])  # Max 500 chars

    return queries


def test_api_health():
    """Ping the health endpoint and report the outcome on stdout.

    Returns:
        True when GET ``{API_URL}/api/health`` answers with HTTP 200 and its
        JSON body could be read; False for any other status code or when the
        request or the handling of its response raises.
    """
    print("\n🔍 Testing API connection...")
    healthy = False
    try:
        response = requests.get(f"{API_URL}/api/health", timeout=10)
        if response.status_code != 200:
            print(f"   ❌ API returned status {response.status_code}")
        else:
            payload = response.json()
            print(f"   ✅ API is healthy! Version: {payload.get('version', 'unknown')}")
            # Only flagged healthy once the body was parsed and reported
            healthy = True
    except requests.exceptions.ConnectionError:
        print("   ❌ Cannot connect to API. Check your internet connection.")
    except Exception as exc:
        print(f"   ❌ Connection failed: {exc}")
    return healthy


def test_api_key():
    """Verify that the configured API key is accepted by the usage endpoint.

    Prints the account details returned by ``{API_URL}/api/usage`` on success,
    or a short diagnostic on failure.

    Returns:
        True when the endpoint answers with HTTP 200 and its JSON body could
        be read; False when the key is still the placeholder, the endpoint
        answers with any other status, or the request raises.
    """
    print("\n🔑 Verifying API key...")

    if API_KEY == "YOUR_API_KEY_HERE":
        print("   ❌ Please replace 'YOUR_API_KEY_HERE' with your actual API key!")
        # Refer to the setting by name: a hard-coded line number goes stale
        print("   📝 Open this script and edit the API_KEY value near the top of the file")
        return False

    try:
        # Pass the key via params= so requests URL-encodes it; a key with
        # reserved characters (&, #, +, ...) would otherwise corrupt the query.
        r = requests.get(f"{API_URL}/api/usage", params={"api_key": API_KEY}, timeout=10)
        if r.status_code == 200:
            data = r.json()
            print(f"   ✅ API key valid!")
            print(f"   📊 Client: {data.get('client_name', 'Unknown')}")
            print(f"   📊 Plan: {data.get('plan', 'standard')}")
            print(f"   📊 Queries used: {data.get('queries_used', 0)}")
            # Prefer 'quota_remaining'; fall back to 'quota', then 'unlimited'
            quota = data.get('quota_remaining', data.get('quota', 'unlimited'))
            print(f"   📊 Quota remaining: {quota}")
            return True
        elif r.status_code == 401:
            print("   ❌ Invalid API key")
            print("   📝 Check that you copied the key correctly")
            return False
        elif r.status_code == 403:
            print("   ❌ API key expired or deactivated")
            print("   📝 Contact support for a new key")
            return False
        else:
            print(f"   ❌ Unexpected response: {r.status_code}")
            return False
    except Exception as e:
        print(f"   ❌ Verification failed: {e}")
        return False


def _post_batch(batch, dedup_scope, session_id):
    """POST one batch of queries to the batch endpoint and return the raw response."""
    return requests.post(
        f"{API_URL}/api/batch",
        headers={
            "X-API-Key": API_KEY,
            "Content-Type": "application/json"
        },
        json={
            "queries": batch,
            "dedup_scope": dedup_scope,
            "session_id": session_id  # All batches share same session
        },
        timeout=120
    )


def process_batch(queries, batch_size=1000, dedup_scope="batch"):
    """
    Send queries to the batch endpoint in chunks and collect the results.

    A batch answered with HTTP 429 is retried once after a 5 second pause.
    Batches that still fail (error status, timeout, or any exception) are
    reported and skipped, so the returned list can be shorter than `queries`.

    Args:
        queries: List of query strings to process
        batch_size: Number of queries per API call (default 1000)
        dedup_scope: Deduplication scope sent to the API unchanged:
            - "batch": Find duplicates within THIS test run (default - independent runs)
                       All batches share the same session, so duplicates are found
                       across the entire file, not just per-batch.
            - "session": Find duplicates within last 1 hour
            - "daily": Find duplicates within last 24 hours
            - "historical": Find duplicates against ALL historical data

    Returns:
        The concatenated "results" lists of every batch that succeeded.

    Raises:
        ValueError: If batch_size is not positive.
    """
    if batch_size < 1:
        # A negative step would silently process nothing; zero breaks range()
        raise ValueError("batch_size must be a positive integer")

    results = []
    total = len(queries)

    # Generate unique session_id for this test run (batch mode).
    # A random UUID keeps concurrent runs (even in the same second, on other
    # machines or processes) from sharing one dedup session by accident.
    session_id = f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex}"

    print(f"\n📤 Processing {total} queries...")
    print(f"   🔍 Dedup scope: {dedup_scope.upper()}")
    if dedup_scope == "batch":
        print(f"   🔗 Session ID: {session_id[:30]}...")

    start_time = time.time()
    total_batches = (total + batch_size - 1) // batch_size
    failed_queries = 0

    for batch_num, offset in enumerate(range(0, total, batch_size), start=1):
        batch = queries[offset:offset + batch_size]
        succeeded = False

        print(f"   Batch {batch_num}/{total_batches} ({len(batch)} queries)...", end=" ", flush=True)

        try:
            r = _post_batch(batch, dedup_scope, session_id)
            retried = False

            if r.status_code == 429:
                print("⏳ Rate limited, waiting...")
                time.sleep(5)
                r = _post_batch(batch, dedup_scope, session_id)
                retried = True

            if r.status_code == 200:
                results.extend(r.json().get("results", []))
                print("✅ (retry)" if retried else "✅")
                succeeded = True
            else:
                # Also reached when the retry fails, so no batch is lost silently
                print(f"❌ Error: {r.status_code}")
                error_msg = r.text[:200] if r.text else "No error message"
                print(f"      {error_msg}")
        except requests.exceptions.Timeout:
            print("⏰ Timeout - batch too large?")
        except Exception as e:
            print(f"❌ Failed: {e}")

        if not succeeded:
            failed_queries += len(batch)

    elapsed = time.time() - start_time
    qps = total / elapsed if elapsed > 0 else 0
    print(f"\n   ⏱️ Completed in {elapsed:.1f}s ({qps:.0f} queries/sec)")
    if failed_queries:
        print(f"   ⚠️ {failed_queries} queries were in failed batches and have no results")

    return results


def generate_report(queries, results, filepath, output_filters=None, output_formats=None):
    """
    Generate UNIFIED report with all 3 products analysis in multiple enterprise formats.
    
    Args:
        queries: Original list of query strings
        results: API results for each query
        filepath: Source file path
        output_filters: List of filter types to generate (default: all)
        output_formats: List of output formats (default: csv, json, xlsx)
    """
    
    # Start timing for processing stats
    processing_start = time.time()
    
    # Calculate stats
    total = len(results)
    duplicates = sum(1 for r in results if r.get("is_duplicate"))
    unique = total - duplicates
    dedup_rate = (duplicates / total * 100) if total > 0 else 0
    
    # Get confidence stats
    confidences = [r.get("confidence", 0) for r in results if r.get("is_duplicate")]
    avg_confidence = sum(confidences) / len(confidences) if confidences else 0
    
    # UNIFIED STATS (all 3 products)
    unified_stats = {
        # Toxicity (Gaming)
        "toxic_count": sum(1 for r in results if r.get("unified", {}).get("is_toxic", False)),
        "ban_count": sum(1 for r in results if r.get("unified", {}).get("action") == "BAN"),
        "warning_count": sum(1 for r in results if r.get("unified", {}).get("action") == "WARNING"),
        "monitor_count": sum(1 for r in results if r.get("unified", {}).get("action") == "MONITOR"),
        "spam_count": sum(1 for r in results if r.get("unified", {}).get("is_spam", False)),
        "cheat_count": sum(1 for r in results if r.get("unified", {}).get("cheat_mention", False)),
        # Deduplication
        "masters": unique,
        "merges": duplicates,
        # Cache
        "cache_hits": duplicates,
        "cache_misses": unique
    }
    
    # Detect product type from results (for backward compatibility)
    product = "Unknown"
    if results:
        product = results[0].get("product", "Unified API")
    
    # Legacy product-specific stats (for backward compatibility)
    gaming_stats = None
    cache_stats = None
    cleaning_stats = None
    
    if "Gaming" in product:
        gaming_stats = {
            "toxic_messages": unified_stats["toxic_count"],
            "ban_recommended": unified_stats["ban_count"],
            "warnings": unified_stats["warning_count"],
            "monitors": unified_stats["monitor_count"],
            "spam_detected": unified_stats["spam_count"],
            "cheat_mentions": unified_stats["cheat_count"],
            "repeated_msgs": duplicates,
            "clean_messages": total - unified_stats["toxic_count"]
        }
    elif "Cache" in product:
        cache_stats = {
            "cache_hits": duplicates,
            "cache_misses": unique,
            "hit_rate": f"{dedup_rate:.1f}%"
        }
    elif "Cleaning" in product:
        cleaning_stats = {
            "master_records": unique,
            "duplicate_records": duplicates,
            "compression_ratio": f"{(1 - unique/total)*100:.1f}%" if total > 0 else "0%"
        }
    
    print("\n" + "═" * 60)
    print("🧠 HIWOSY™ UNIFIED API REPORT")
    print("   3 Products • 1 API Key • 1 Combined Result")
    print("═" * 60)
    print(f"   📁 File: {filepath}")
    print(f"   📅 Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("-" * 60)
    print(f"   📈 Total queries analyzed: {total:,}")
    print(f"   ✅ Unique queries: {unique:,}")
    print(f"   🔄 Duplicates found: {duplicates:,}")
    print("-" * 60)
    
    # ═══════════════════════════════════════════════════════════════════════════════
    # UNIFIED STATS: ALL 3 PRODUCTS
    # ═══════════════════════════════════════════════════════════════════════════════
    
    print("\n📦 PRODUCT 1: SEMANTIC DEDUPLICATION")
    print(f"   📌 MASTER records: {unified_stats['masters']:,}")
    print(f"   🔗 MERGE records: {unified_stats['merges']:,}")
    print(f"   📊 Dedup rate: {dedup_rate:.1f}%")
    
    print("\n💾 PRODUCT 2: API CACHE")
    print(f"   ⚡ Cache HITs: {unified_stats['cache_hits']:,}")
    print(f"   ❌ Cache MISSes: {unified_stats['cache_misses']:,}")
    
    print("\n🎮 PRODUCT 3: GAMING TOXICITY")
    print(f"   ☠️ Toxic messages: {unified_stats['toxic_count']:,}")
    print(f"   🚫 BAN recommended: {unified_stats['ban_count']:,}")
    print(f"   ⚠️ WARNING recommended: {unified_stats['warning_count']:,}")
    print(f"   👁️ MONITOR recommended: {unified_stats['monitor_count']:,}")
    print(f"   📢 Spam detected: {unified_stats['spam_count']:,}")
    print(f"   🎯 Cheat mentions: {unified_stats['cheat_count']:,}")
    print(f"   ✅ Clean messages: {total - unified_stats['toxic_count']:,}")
    
    # Legacy product-specific report sections (backward compatibility)
    if gaming_stats:
        pass  # Already shown in unified stats above
    toxic_rate = (unified_stats['toxic_count'] / total * 100) if total > 0 else 0
    print(f"\n   📊 Toxicity rate: {toxic_rate:.1f}%")
    print("-" * 60)
    
    print(f"\n   📊 Overall deduplication rate: {dedup_rate:.1f}%")
    if avg_confidence > 0:
        print(f"   🎯 Average match confidence: {avg_confidence:.1%}")
    print("═" * 60)
    
    # Save detailed results
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # Create organized reports folder structure
    reports_folder = "reports"
    os.makedirs(reports_folder, exist_ok=True)
    
    report_name = os.path.join(reports_folder, f"hiwosy_report_{timestamp}.json")
    
    # Build duplicate pairs for report
    duplicate_examples = []
    query_map = {i: q for i, q in enumerate(queries)}
    
    for r in results:
        if r.get("is_duplicate"):
            duplicate_examples.append({
                "query": r.get("query", ""),
                "matched_to_id": r.get("duplicate_of"),
                "confidence": r.get("confidence", 0),
                "match_type": r.get("match_type", "semantic")
            })
    
    # ═══════════════════════════════════════════════════════════════════════════════
    # ENTERPRISE JSON REPORT - Professional wrapper with audit trail
    # ═══════════════════════════════════════════════════════════════════════════════
    
    # Generate unique run ID for audit trail
    run_id = f"hiwosy_run_{timestamp}_{uuid.uuid4().hex[:8]}"
    processing_end = time.time()
    processing_time = round(processing_end - processing_start, 3)
    
    report = {
        # ═══════════════════════════════════════════════════════════════════════════
        # PROFESSIONAL API WRAPPER (Enterprise Standard)
        # ═══════════════════════════════════════════════════════════════════════════
        "api_version": "2.1.0",
        "run_id": run_id,
        "status": "completed",
        "created_at": datetime.now(timezone.utc).isoformat(),
        
        # Audit & Debugging Info
        "metadata": {
            "source_file": filepath,
            "report_id": timestamp,
            "tool": "HIWOSY™ Unified API",
            "documentation": "https://www.hiwosy.com/docs",
            "api_endpoint": API_URL,
            "generated_by": "test_my_data.py",
            "export_formats": ["json", "csv", "xlsx", "parquet", "sql"]
        },
        
        # ═══════════════════════════════════════════════════════════════════════════
        # EXECUTIVE SUMMARY (Top-level KPIs)
        # ═══════════════════════════════════════════════════════════════════════════
        "summary": {
            "total_queries_analyzed": total,
            "unique_queries": unique,
            "duplicate_queries": duplicates,
            "deduplication_rate_percent": round(dedup_rate, 2),
            "average_match_confidence": round(avg_confidence * 100, 2) if avg_confidence > 0 else 0,
            "processing_stats": {
                "items_processed": total,
                "processing_time_seconds": processing_time
            }
        },
        
        # ═══════════════════════════════════════════════════════════════════════════
        # PRODUCT 1: SEMANTIC DEDUPLICATION
        # Finds semantically similar content, marks as UNIQUE/MASTER or DUPLICATE/MERGE
        # ═══════════════════════════════════════════════════════════════════════════
        "product_1_deduplication": {
            "product_name": "Semantic Deduplication",
            "description": "Identifies semantically similar queries using advanced NLP",
            "statistics": {
                "total_analyzed": total,
                "unique_count": unique,
                "duplicate_count": duplicates,
                "compression_rate_percent": round(dedup_rate, 2),
                "average_similarity_score": round(avg_confidence * 100, 2) if avg_confidence > 0 else 0
            },
            "status_legend": {
                "UNIQUE": "First occurrence, no similar queries found",
                "MASTER": "Original query that duplicates reference",
                "DUPLICATE": "Semantically similar to existing query",
                "MERGE": "Recommended to merge with master query"
            }
        },
        
        # ═══════════════════════════════════════════════════════════════════════════
        # PRODUCT 2: SEMANTIC CACHE
        # Assigns MASTER (cache) or MERGE (use cached) status for API cost savings
        # ═══════════════════════════════════════════════════════════════════════════
        "product_2_cache": {
            "product_name": "Semantic API Cache",
            "description": "Reduces API costs by caching semantically similar queries",
            "statistics": {
                "cache_hits": unified_stats["cache_hits"],
                "cache_misses": unified_stats["cache_misses"],
                "hit_rate_percent": round(unified_stats["cache_hits"] / total * 100, 2) if total > 0 else 0
            },
            "status_legend": {
                "MASTER": "Store response in cache (new unique query)",
                "MERGE": "Use cached response (similar query exists)"
            }
        },
        
        # ═══════════════════════════════════════════════════════════════════════════
        # PRODUCT 3: GAMING BEHAVIOR TOXICITY
        # Detects toxic content, recommends BAN/MONITOR/SAFE actions
        # ═══════════════════════════════════════════════════════════════════════════
        "product_3_toxicity": {
            "product_name": "Gaming Behavior Detection",
            "description": "AI-powered toxicity detection for gaming communities",
            "statistics": {
                "total_analyzed": total,
                "safe_count": total - unified_stats["toxic_count"],
                "toxic_count": unified_stats["toxic_count"],
                "toxicity_rate_percent": round(toxic_rate, 2)
            },
            "actions_breakdown": {
                "immediate_ban": unified_stats["ban_count"],
                "warning": unified_stats["warning_count"],
                "monitor": unified_stats["monitor_count"],
                "none": total - unified_stats["ban_count"] - unified_stats["warning_count"] - unified_stats["monitor_count"]
            },
            "pattern_detection": {
                "spam_detected": unified_stats["spam_count"],
                "cheat_mentions": unified_stats["cheat_count"],
                "repeated_messages": unified_stats.get("repeated_count", 0)
            },
            "toxicity_levels": {
                "SEVERE": "Highly toxic - Immediate action recommended",
                "HIGH": "Significant toxicity - Warning or ban",
                "MODERATE": "Some concerning content - Monitor",
                "MILD": "Minor issues - Review manually",
                "SAFE": "No toxicity detected"
            },
            "action_legend": {
                "BAN": "Recommend immediate ban",
                "WARN": "Issue warning to user",
                "MONITOR": "Flag for monitoring",
                "NONE": "No action required"
            }
        },
        
        # ═══════════════════════════════════════════════════════════════════════════
        # PER-QUERY DETAILED RESULTS (All 3 products per query)
        # ═══════════════════════════════════════════════════════════════════════════
        "data": results,  # Enterprise-standard key name
        
        # Sample duplicates for quick review
        "sample_duplicates": duplicate_examples[:50],
        
        # ═══════════════════════════════════════════════════════════════════════════
        # STATUS & ERROR HANDLING (Enterprise Standard)
        # ═══════════════════════════════════════════════════════════════════════════
        "warnings": [],
        "errors": []
    }
    
    # Add warnings if needed
    if total == 0:
        report["warnings"].append({"code": "NO_DATA", "message": "No queries were processed"})
    if unified_stats["toxic_count"] > total * 0.5:
        report["warnings"].append({"code": "HIGH_TOXICITY", "message": f"More than 50% of content is toxic ({toxic_rate:.1f}%)"})
    if dedup_rate > 80:
        report["warnings"].append({"code": "HIGH_DUPLICATION", "message": f"Very high duplication rate ({dedup_rate:.1f}%)"})
    
    with open(report_name, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    
    print(f"\n📁 JSON report saved: {report_name}")
    
    # Build query_id to row_number mapping for readable duplicate references
    query_id_to_row = {}
    for row_num, r in enumerate(results, start=2):  # Start at 2 (row 1 is header)
        qid = r.get("query_id")
        if qid is not None:
            query_id_to_row[qid] = row_num
    
    # ═══════════════════════════════════════════════════════════════════════════════
    # PRODUCTION CSV OUTPUT - All 3 Products Per Row (Detailed)
    # ═══════════════════════════════════════════════════════════════════════════════
    csv_report = os.path.join(reports_folder, f"hiwosy_report_{timestamp}.csv")
    with open(csv_report, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        
        # PRODUCTION HEADERS: Comprehensive columns for all 3 products
        writer.writerow([
            # Identifier
            'Row_ID', 
            'Query_Text',
            
            # ═══ PRODUCT 1: SEMANTIC DEDUPLICATION ═══
            'PRODUCT1_Dedup_Status',        # UNIQUE/DUPLICATE
            'PRODUCT1_Dedup_Action',         # MASTER/MERGE
            'PRODUCT1_Duplicate_Of_Row',     # Which row is the master
            'PRODUCT1_Similarity_Score',     # 0.00-1.00
            'PRODUCT1_Match_Type',           # exact/semantic/fuzzy
            
            # ═══ PRODUCT 2: SEMANTIC CACHE ═══
            'PRODUCT2_Cache_Status',         # MASTER/MERGE
            
            # ═══ PRODUCT 3: GAMING TOXICITY ═══
            'PRODUCT3_Toxicity_Level',       # SAFE/MILD/MODERATE/HIGH/SEVERE
            'PRODUCT3_Toxicity_Score',       # 0-100
            'PRODUCT3_Recommended_Action',   # NONE/MONITOR/WARN/BAN
            'PRODUCT3_Is_Spam',              # YES/NO
            'PRODUCT3_Cheat_Mention',        # YES/NO
            'PRODUCT3_Is_Repeated',          # YES/NO
            
            # Additional context
            'Cluster_ID'               # For grouping related queries
        ])
        
        for row_num, r in enumerate(results, start=2):
            # Get unified data (all 3 products)
            unified = r.get("unified", {})
            
            # Convert query_id to row number for readability
            dup_of_id = r.get("duplicate_of")
            current_query_id = r.get("query_id")
            
            if dup_of_id and dup_of_id == current_query_id:
                dup_of_row = "HISTORICAL"  # Matched a query from previous session
            elif dup_of_id:
                dup_of_row = query_id_to_row.get(dup_of_id, f"ID:{dup_of_id}")
            else:
                dup_of_row = ""
            
            # Determine dedup status and action
            is_dup = r.get("is_duplicate", False)
            dedup_status = "DUPLICATE" if is_dup else "UNIQUE"
            dedup_action = unified.get("dedup_status", "MASTER")  # MASTER or MERGE
            
            # Match type
            match_type = r.get("match_type", "semantic") if is_dup else ""
            
            # Cluster ID
            cluster = r.get("cluster_id", "")
            
            # PRODUCTION ROW: All 3 products with detailed columns
            row = [
                # Identifier
                row_num,
                r.get("query", "")[:200],  # Allow longer text
                
                # Product 1: Deduplication
                dedup_status,
                dedup_action,
                dup_of_row,
                f"{r.get('confidence', 0):.4f}",  # 4 decimal precision
                match_type,
                
                # Product 2: Cache
                unified.get("cache_status", "MASTER"),
                
                # Product 3: Gaming Toxicity
                unified.get("toxicity_level", "SAFE"),
                unified.get("toxicity_score", 0),
                unified.get("action", "NONE").upper(),
                "YES" if unified.get("is_spam") else "NO",
                "YES" if unified.get("cheat_mention") else "NO",
                "YES" if unified.get("is_repeated") else "NO",
                
                # Context
                cluster
            ]
            
            writer.writerow(row)
    
    print(f"📁 CSV report saved: {csv_report} (Per-Query Details)")
    
    # ═══════════════════════════════════════════════════════════════════════════════
    # SUMMARY CSV - Quick stats for executives
    # ═══════════════════════════════════════════════════════════════════════════════
    summary_csv = os.path.join(reports_folder, f"hiwosy_summary_{timestamp}.csv")
    with open(summary_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Metric', 'Value', 'Product'])
        
        # Product 1 stats
        writer.writerow(['Total Queries', total, 'All'])
        writer.writerow(['Unique Queries', unique, 'PRODUCT 1: Deduplication'])
        writer.writerow(['Duplicate Queries', duplicates, 'PRODUCT 1: Deduplication'])
        writer.writerow(['Compression Rate', f"{dedup_rate:.2f}%", 'PRODUCT 1: Deduplication'])
        
        # Product 2 stats
        writer.writerow(['Cache Hits', unified_stats["cache_hits"], 'PRODUCT 2: Cache'])
        writer.writerow(['Cache Misses', unified_stats["cache_misses"], 'PRODUCT 2: Cache'])
        
        # Product 3 stats
        writer.writerow(['Safe Messages', total - unified_stats["toxic_count"], 'PRODUCT 3: Toxicity'])
        writer.writerow(['Toxic Messages', unified_stats["toxic_count"], 'PRODUCT 3: Toxicity'])
        writer.writerow(['Ban Recommended', unified_stats["ban_count"], 'PRODUCT 3: Toxicity'])
        writer.writerow(['Warning Recommended', unified_stats["warning_count"], 'PRODUCT 3: Toxicity'])
        writer.writerow(['Monitor Recommended', unified_stats["monitor_count"], 'PRODUCT 3: Toxicity'])
        writer.writerow(['Spam Detected', unified_stats["spam_count"], 'PRODUCT 3: Toxicity'])
        writer.writerow(['Cheat Mentions', unified_stats["cheat_count"], 'PRODUCT 3: Toxicity'])
    
    print(f"📁 Summary CSV saved: {summary_csv} (Executive Summary)")
    
    # ═══════════════════════════════════════════════════════════════════════════════
    # CONSOLE SUMMARY - Production Level
    # ═══════════════════════════════════════════════════════════════════════════════
    print("\n" + "═" * 70)
    print("📊 HIWOSY™ UNIFIED API RESULTS - 3 PRODUCTS")
    print("═" * 70)
    
    # Product 1 Summary
    print("\n🔍 PRODUCT 1: SEMANTIC DEDUPLICATION")
    print(f"   ├─ UNIQUE (MASTER): {unique:,} queries")
    print(f"   ├─ DUPLICATE (MERGE): {duplicates:,} queries")
    print(f"   └─ Compression Rate: {dedup_rate:.1f}%")
    
    # Product 2 Summary
    print("\n💾 PRODUCT 2: SEMANTIC CACHE")
    print(f"   ├─ Cache HITS: {unified_stats['cache_hits']:,}")
    print(f"   └─ Cache MISSES: {unified_stats['cache_misses']:,}")
    
    # Product 3 Summary
    print("\n🎮 PRODUCT 3: GAMING BEHAVIOR TOXICITY")
    print(f"   ├─ SAFE: {total - unified_stats['toxic_count']:,} messages")
    print(f"   ├─ TOXIC: {unified_stats['toxic_count']:,} messages ({toxic_rate:.1f}%)")
    print(f"   ├─ Actions: BAN={unified_stats['ban_count']:,} | WARN={unified_stats['warning_count']:,} | MONITOR={unified_stats['monitor_count']:,}")
    print(f"   └─ Patterns: Spam={unified_stats['spam_count']:,} | Cheat={unified_stats['cheat_count']:,}")
    
    print("\n" + "─" * 70)
    
    # Sample toxic messages
    if unified_stats['toxic_count'] > 0:
        print("\n🚨 SAMPLE TOXIC MESSAGES (Top 5):")
        shown = 0
        for r in results:
            unified = r.get("unified", {})
            if unified.get("is_toxic") and shown < 5:
                query_text = r.get('query', '')[:50]
                level = unified.get("toxicity_level", "?")
                action = unified.get("action", "?").upper()
                print(f"   ⚠️ \"{query_text}...\"")
                print(f"      └─ {level} → {action}")
                shown += 1
        if unified_stats['toxic_count'] > 5:
            print(f"   ... and {unified_stats['toxic_count'] - 5} more")
    
    # Sample duplicates
    if duplicates > 0:
        print("\n🔗 SAMPLE DUPLICATES (Top 5 MERGE candidates):")
        shown = 0
        for r in results:
            if r.get("is_duplicate") and shown < 5:
                query_text = r.get('query', '')[:50]
                conf = r.get('confidence', 0)
                print(f"   🔗 \"{query_text}...\"")
                print(f"      └─ Similarity: {conf:.0%}")
                shown += 1
        if duplicates > 5:
            print(f"   ... and {duplicates - 5} more")
    
    # Generate HTML report (SUMMARY STATS ONLY - no per-query data)
    html_report = os.path.join(reports_folder, f"hiwosy_report_{timestamp}.html")
    html_content = generate_html_report(
        filepath=filepath,
        total=total,
        unique=unique,
        duplicates=duplicates,
        gaming_stats=gaming_stats if gaming_stats else {},
        results=results,
        timestamp=timestamp,
        unified_stats=unified_stats
    )
    with open(html_report, 'w', encoding='utf-8') as f:
        f.write(html_content)
    print(f"📁 HTML report saved: {html_report} (Summary Statistics)")
    
    # ═══════════════════════════════════════════════════════════════════════════════
    # ENTERPRISE EXPORTS - Phase 1 & Phase 2
    # ═══════════════════════════════════════════════════════════════════════════════
    print("\n" + "─" * 50)
    print("📦 ENTERPRISE EXPORTS")
    print("─" * 50)
    
    # Smart Excel (.xlsx) - Multi-sheet workbook
    excel_report = generate_excel_report(
        filepath=filepath,
        results=results,
        unified_stats=unified_stats,
        timestamp=timestamp,
        total=total,
        unique=unique,
        duplicates=duplicates,
        dedup_rate=dedup_rate,
        toxic_rate=toxic_rate
    )
    if excel_report:
        print(f"📁 Excel report saved: {excel_report} (Multi-sheet, Power BI ready)")
    
    # Parquet (.parquet) - Columnar format for big data
    parquet_report = generate_parquet_output(results, timestamp)
    if parquet_report:
        print(f"📁 Parquet saved: {parquet_report} (5-10x smaller, BigQuery/Spark ready)")
    
    # SQL INSERT statements
    sql_report = generate_sql_output(results, timestamp)
    if sql_report:
        print(f"📁 SQL saved: {sql_report} (Ready-to-execute INSERT statements)")
    
    # Power BI configuration
    pbi_report = generate_powerbi_template(
        filepath=filepath,
        timestamp=timestamp,
        total=total,
        unified_stats=unified_stats,
        dedup_rate=dedup_rate,
        toxic_rate=toxic_rate
    )
    if pbi_report:
        print(f"📁 Power BI config saved: {pbi_report} (Dashboard configuration)")
    
    # ═══════════════════════════════════════════════════════════════════════════════
    # DATA FILTER REPORT SYSTEM - Generate separated files per category
    # ═══════════════════════════════════════════════════════════════════════════════
    filtered_files = generate_filtered_reports(
        results=results,
        timestamp=timestamp,
        output_filters=output_filters,
        output_formats=output_formats
    )
    
    # Return all file names for final summary
    return {
        "json": report_name,
        "csv_detail": csv_report,
        "csv_summary": summary_csv,
        "html": html_report,
        "excel": excel_report,
        "parquet": parquet_report,
        "sql": sql_report,
        "powerbi": pbi_report,
        "filtered": filtered_files  # NEW: All filtered report files
    }


def generate_html_report(filepath, total, unique, duplicates, gaming_stats, results, timestamp, unified_stats=None):
    """
    Generate PRODUCTION-LEVEL HTML report with SUMMARY STATISTICS ONLY.

    This report shows aggregated statistics for all 3 products:
    - Product 1: Semantic Deduplication
    - Product 2: API Cache
    - Product 3: Gaming Behavior Toxicity

    NOTE: Per-query details are in CSV and JSON files (not HTML).

    Args:
        filepath: Source file name shown in the header and the details box.
            It is HTML-escaped before being embedded.
        total: Number of analyzed queries; every rate is 0 when this is 0.
        unique: Unique-query count (used for the fallback stats only).
        duplicates: Duplicate-query count (drives the dedup rate).
        gaming_stats: Legacy stats dict, read only when ``unified_stats`` is
            None. ``None`` is treated as an empty dict.
        results: Not used by this function; kept so existing callers that
            pass it keep working.
        timestamp: Report ID shown in the details box (HTML-escaped).
        unified_stats: Dict with the keys ``masters``, ``merges``,
            ``cache_hits``, ``cache_misses``, ``toxic_count``, ``ban_count``,
            ``warning_count`` and ``monitor_count`` (``spam_count`` is
            optional). Built from the other arguments when None.

    Returns:
        The complete HTML document as a string.
    """
    # Local import keeps this block self-contained (no new file-level import).
    from html import escape

    # Use unified_stats if provided, otherwise fall back to gaming_stats
    if unified_stats is None:
        legacy = gaming_stats or {}
        unified_stats = {
            "masters": unique,
            "merges": duplicates,
            "cache_hits": duplicates,
            "cache_misses": unique,
            "toxic_count": legacy.get('toxic_messages', 0),
            "ban_count": legacy.get('ban_recommended', 0),
            "warning_count": legacy.get('warnings', 0),
            "monitor_count": legacy.get('monitors', 0),
            "spam_count": legacy.get('spam_detected', 0),
            "cheat_count": legacy.get('cheat_mentions', 0),
            "repeated_count": legacy.get('repeated_msgs', 0)
        }

    toxic_rate = (unified_stats['toxic_count'] / total * 100) if total > 0 else 0
    safe_rate = 100 - toxic_rate
    dedup_rate = (duplicates / total * 100) if total > 0 else 0
    cache_hit_rate = (unified_stats['cache_hits'] / total * 100) if total > 0 else 0

    # File names and report IDs come from the command line / file system and
    # may contain markup-significant characters, so escape them for HTML.
    safe_filepath = escape(str(filepath))
    safe_timestamp = escape(str(timestamp))

    # Take the clock reading once so the header and the details box agree.
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    return f'''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>HIWOSY™ Unified API Report - 3 Products</title>
    <link href="https://fonts.googleapis.com/css2?family=Outfit:wght@400;500;600;700;800&display=swap" rel="stylesheet">
    <style>
        :root {{
            --primary: #f97316;
            --primary-glow: rgba(249, 115, 22, 0.3);
            --bg-dark: #0a0a0f;
            --bg-card: #12121a;
            --bg-card-hover: #1a1a24;
            --text: #f0f0f5;
            --text-muted: #8888aa;
            --success: #00d47e;
            --warning: #ffb020;
            --danger: #ff4757;
            --info: #4da6ff;
            --purple: #a855f7;
            --cyan: #06b6d4;
        }}
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{
            font-family: 'Outfit', system-ui, sans-serif;
            background: var(--bg-dark);
            background-image: 
                radial-gradient(ellipse at 20% 0%, rgba(249, 115, 22, 0.08) 0%, transparent 50%),
                radial-gradient(ellipse at 80% 100%, rgba(168, 85, 247, 0.06) 0%, transparent 50%);
            color: var(--text);
            padding: 40px;
            line-height: 1.6;
            min-height: 100vh;
        }}
        .container {{ max-width: 1400px; margin: 0 auto; }}
        
        /* Header */
        .header {{
            text-align: center;
            margin-bottom: 50px;
            padding: 40px;
            background: linear-gradient(135deg, rgba(249, 115, 22, 0.1), rgba(168, 85, 247, 0.05));
            border-radius: 24px;
            border: 1px solid rgba(249, 115, 22, 0.2);
        }}
        h1 {{
            font-size: 3rem;
            font-weight: 800;
            background: linear-gradient(135deg, var(--primary), #fb923c, var(--purple));
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            background-clip: text;
            margin-bottom: 8px;
            letter-spacing: -1px;
        }}
        .tagline {{
            font-size: 1.1rem;
            color: var(--text-muted);
            margin-bottom: 16px;
        }}
        .meta-info {{
            display: flex;
            justify-content: center;
            gap: 30px;
            flex-wrap: wrap;
            color: var(--text-muted);
            font-size: 0.9rem;
        }}
        .meta-item {{ display: flex; align-items: center; gap: 6px; }}
        
        /* Product Sections */
        .products-grid {{
            display: grid;
            grid-template-columns: repeat(3, 1fr);
            gap: 24px;
            margin-bottom: 40px;
        }}
        @media (max-width: 1100px) {{ .products-grid {{ grid-template-columns: 1fr; }} }}
        
        .product-card {{
            background: var(--bg-card);
            border-radius: 20px;
            padding: 28px;
            border: 1px solid rgba(255,255,255,0.08);
            transition: all 0.3s ease;
        }}
        .product-card:hover {{
            transform: translateY(-4px);
            box-shadow: 0 20px 40px rgba(0,0,0,0.3);
        }}
        .product-card.dedup {{ border-top: 3px solid var(--cyan); }}
        .product-card.cache {{ border-top: 3px solid var(--purple); }}
        .product-card.toxicity {{ border-top: 3px solid var(--danger); }}
        
        .product-header {{
            display: flex;
            align-items: center;
            gap: 12px;
            margin-bottom: 24px;
        }}
        .product-icon {{
            width: 48px;
            height: 48px;
            border-radius: 12px;
            display: flex;
            align-items: center;
            justify-content: center;
            font-size: 1.5rem;
        }}
        .product-icon.dedup {{ background: rgba(6, 182, 212, 0.2); }}
        .product-icon.cache {{ background: rgba(168, 85, 247, 0.2); }}
        .product-icon.toxicity {{ background: rgba(255, 71, 87, 0.2); }}
        
        .product-title {{
            font-size: 1.3rem;
            font-weight: 700;
        }}
        .product-subtitle {{
            font-size: 0.75rem;
            color: var(--text-muted);
            text-transform: uppercase;
            letter-spacing: 1px;
        }}
        
        .metric-grid {{
            display: grid;
            grid-template-columns: repeat(2, 1fr);
            gap: 16px;
        }}
        .metric {{
            background: rgba(255,255,255,0.03);
            border-radius: 12px;
            padding: 16px;
            text-align: center;
        }}
        .metric-value {{
            font-size: 2rem;
            font-weight: 700;
            margin-bottom: 4px;
        }}
        .metric-value.cyan {{ color: var(--cyan); }}
        .metric-value.purple {{ color: var(--purple); }}
        .metric-value.success {{ color: var(--success); }}
        .metric-value.danger {{ color: var(--danger); }}
        .metric-value.warning {{ color: var(--warning); }}
        .metric-value.info {{ color: var(--info); }}
        .metric-label {{
            font-size: 0.7rem;
            color: var(--text-muted);
            text-transform: uppercase;
            letter-spacing: 0.5px;
        }}
        
        .big-metric {{
            grid-column: span 2;
            background: linear-gradient(135deg, rgba(255,255,255,0.05), rgba(255,255,255,0.02));
            border: 1px solid rgba(255,255,255,0.1);
        }}
        .big-metric .metric-value {{ font-size: 2.8rem; }}
        
        /* Summary Bar */
        .summary-bar {{
            display: grid;
            grid-template-columns: repeat(4, 1fr);
            gap: 20px;
            margin-bottom: 40px;
        }}
        @media (max-width: 800px) {{ .summary-bar {{ grid-template-columns: repeat(2, 1fr); }} }}
        
        .summary-card {{
            background: var(--bg-card);
            border-radius: 16px;
            padding: 24px;
            text-align: center;
            border: 1px solid rgba(255,255,255,0.08);
        }}
        .summary-value {{
            font-size: 2.5rem;
            font-weight: 800;
            background: linear-gradient(135deg, var(--primary), #fb923c);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            background-clip: text;
        }}
        .summary-label {{
            font-size: 0.8rem;
            color: var(--text-muted);
            text-transform: uppercase;
            letter-spacing: 0.5px;
            margin-top: 4px;
        }}
        
        /* Visual Bar */
        .visual-bar {{
            margin: 30px 0;
        }}
        .bar-title {{
            font-size: 0.8rem;
            color: var(--text-muted);
            text-transform: uppercase;
            letter-spacing: 1px;
            margin-bottom: 12px;
        }}
        .bar-container {{
            height: 32px;
            border-radius: 16px;
            overflow: hidden;
            display: flex;
            background: rgba(255,255,255,0.05);
        }}
        .bar-segment {{
            height: 100%;
            display: flex;
            align-items: center;
            justify-content: center;
            font-size: 0.7rem;
            font-weight: 600;
            color: white;
            transition: width 0.5s ease;
        }}
        .bar-segment.safe {{ background: var(--success); }}
        .bar-segment.warning {{ background: var(--warning); }}
        .bar-segment.danger {{ background: var(--danger); }}
        .bar-segment.unique {{ background: var(--cyan); }}
        .bar-segment.duplicate {{ background: var(--purple); }}
        
        /* File Info */
        .file-info {{
            background: var(--bg-card);
            border-radius: 16px;
            padding: 24px;
            margin-bottom: 30px;
            border: 1px solid rgba(255,255,255,0.08);
        }}
        .file-info-title {{
            font-size: 0.75rem;
            color: var(--text-muted);
            text-transform: uppercase;
            letter-spacing: 1px;
            margin-bottom: 12px;
        }}
        .file-info-grid {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 16px;
        }}
        .file-info-item {{
            display: flex;
            align-items: center;
            gap: 8px;
        }}
        .file-info-item span:first-child {{ color: var(--text-muted); }}
        
        /* Footer */
        .footer {{
            text-align: center;
            margin-top: 60px;
            padding-top: 30px;
            border-top: 1px solid rgba(255,255,255,0.08);
            color: var(--text-muted);
            font-size: 0.85rem;
        }}
        .footer a {{ color: var(--primary); text-decoration: none; }}
        .footer a:hover {{ text-decoration: underline; }}
        
        /* Note */
        .note {{
            background: rgba(249, 115, 22, 0.1);
            border: 1px solid rgba(249, 115, 22, 0.3);
            border-radius: 12px;
            padding: 16px 20px;
            margin-top: 30px;
            font-size: 0.9rem;
            color: var(--text-muted);
        }}
        .note strong {{ color: var(--primary); }}
    </style>
</head>
<body>
    <div class="container">
        <!-- Header -->
        <div class="header">
            <h1>🧠 HIWOSY™ UNIFIED API</h1>
            <p class="tagline">3 Products • 1 API Key • Comprehensive Analysis</p>
            <div class="meta-info">
                <div class="meta-item">📁 {safe_filepath}</div>
                <div class="meta-item">📅 {generated_at}</div>
                <div class="meta-item">📊 {total:,} queries analyzed</div>
            </div>
        </div>
        
        <!-- Quick Summary -->
        <div class="summary-bar">
            <div class="summary-card">
                <div class="summary-value">{total:,}</div>
                <div class="summary-label">Total Queries</div>
            </div>
            <div class="summary-card">
                <div class="summary-value">{dedup_rate:.1f}%</div>
                <div class="summary-label">Dedup Rate</div>
            </div>
            <div class="summary-card">
                <div class="summary-value">{cache_hit_rate:.1f}%</div>
                <div class="summary-label">Cache Hit Rate</div>
            </div>
            <div class="summary-card">
                <div class="summary-value">{toxic_rate:.1f}%</div>
                <div class="summary-label">Toxicity Rate</div>
            </div>
        </div>
        
        <!-- 3 Products Grid -->
        <div class="products-grid">
            <!-- Product 1: Semantic Deduplication -->
            <div class="product-card dedup">
                <div class="product-header">
                    <div class="product-icon dedup">🔍</div>
                    <div>
                        <div class="product-title">Semantic Deduplication</div>
                        <div class="product-subtitle">Product 1</div>
                    </div>
                </div>
                <div class="metric-grid">
                    <div class="metric big-metric">
                        <div class="metric-value cyan">{dedup_rate:.1f}%</div>
                        <div class="metric-label">Compression Rate</div>
                    </div>
                    <div class="metric">
                        <div class="metric-value success">{unified_stats['masters']:,}</div>
                        <div class="metric-label">MASTER (Unique)</div>
                    </div>
                    <div class="metric">
                        <div class="metric-value purple">{unified_stats['merges']:,}</div>
                        <div class="metric-label">MERGE (Duplicate)</div>
                    </div>
                </div>
                <div class="visual-bar">
                    <div class="bar-title">Unique vs Duplicate</div>
                    <div class="bar-container">
                        <div class="bar-segment unique" style="width: {100 - dedup_rate}%">{100 - dedup_rate:.0f}%</div>
                        <div class="bar-segment duplicate" style="width: {dedup_rate}%">{dedup_rate:.0f}%</div>
                    </div>
                </div>
            </div>
            
            <!-- Product 2: API Cache -->
            <div class="product-card cache">
                <div class="product-header">
                    <div class="product-icon cache">💾</div>
                    <div>
                        <div class="product-title">Semantic Cache</div>
                        <div class="product-subtitle">Product 2</div>
                    </div>
                </div>
                <div class="metric-grid">
                    <div class="metric big-metric">
                        <div class="metric-value purple">{cache_hit_rate:.1f}%</div>
                        <div class="metric-label">Cache Hit Rate</div>
                    </div>
                    <div class="metric">
                        <div class="metric-value success">{unified_stats['cache_hits']:,}</div>
                        <div class="metric-label">Cache HITS</div>
                    </div>
                    <div class="metric">
                        <div class="metric-value info">{unified_stats['cache_misses']:,}</div>
                        <div class="metric-label">Cache MISS</div>
                    </div>
                </div>
                <div class="visual-bar">
                    <div class="bar-title">Cache Hit Rate</div>
                    <div class="bar-container">
                        <div class="bar-segment safe" style="width: {cache_hit_rate}%">{cache_hit_rate:.0f}% HIT</div>
                        <div class="bar-segment warning" style="width: {100 - cache_hit_rate}%">{100 - cache_hit_rate:.0f}% MISS</div>
                    </div>
                </div>
            </div>
            
            <!-- Product 3: Gaming Toxicity -->
            <div class="product-card toxicity">
                <div class="product-header">
                    <div class="product-icon toxicity">🎮</div>
                    <div>
                        <div class="product-title">Gaming Behavior</div>
                        <div class="product-subtitle">Product 3</div>
                    </div>
                </div>
                <div class="metric-grid">
                    <div class="metric">
                        <div class="metric-value success">{total - unified_stats['toxic_count']:,}</div>
                        <div class="metric-label">SAFE</div>
                    </div>
                    <div class="metric">
                        <div class="metric-value danger">{unified_stats['toxic_count']:,}</div>
                        <div class="metric-label">TOXIC</div>
                    </div>
                    <div class="metric">
                        <div class="metric-value danger">{unified_stats['ban_count']:,}</div>
                        <div class="metric-label">🚫 BAN</div>
                    </div>
                    <div class="metric">
                        <div class="metric-value warning">{unified_stats['warning_count']:,}</div>
                        <div class="metric-label">⚠️ WARN</div>
                    </div>
                    <div class="metric">
                        <div class="metric-value info">{unified_stats['monitor_count']:,}</div>
                        <div class="metric-label">👁️ MONITOR</div>
                    </div>
                    <div class="metric">
                        <div class="metric-value purple">{unified_stats.get('spam_count', 0):,}</div>
                        <div class="metric-label">📢 SPAM</div>
                    </div>
                </div>
                <div class="visual-bar">
                    <div class="bar-title">Toxicity Distribution</div>
                    <div class="bar-container">
                        <div class="bar-segment safe" style="width: {safe_rate}%">{safe_rate:.0f}%</div>
                        <div class="bar-segment danger" style="width: {toxic_rate}%">{toxic_rate:.0f}%</div>
                    </div>
                </div>
            </div>
        </div>
        
        <!-- File Details -->
        <div class="file-info">
            <div class="file-info-title">📋 Report Details</div>
            <div class="file-info-grid">
                <div class="file-info-item">
                    <span>File:</span>
                    <strong>{safe_filepath}</strong>
                </div>
                <div class="file-info-item">
                    <span>Generated:</span>
                    <strong>{generated_at}</strong>
                </div>
                <div class="file-info-item">
                    <span>API Version:</span>
                    <strong>2.0.0</strong>
                </div>
                <div class="file-info-item">
                    <span>Report ID:</span>
                    <strong>{safe_timestamp}</strong>
                </div>
            </div>
        </div>
        
        <div class="note">
            <strong>📝 Note:</strong> This HTML report shows <strong>summary statistics only</strong>. 
            For detailed per-query results, please refer to the accompanying <strong>CSV</strong> and <strong>JSON</strong> files.
            Each row in those files contains the full analysis for all 3 products.
        </div>
        
        <!-- Footer -->
        <div class="footer">
            <p>© 2025 HIWOSY™ - Unified Semantic Intelligence API</p>
            <p>
                <a href="https://www.hiwosy.com">www.hiwosy.com</a> | 
                Semantic Deduplication • API Cache • Gaming Behavior
            </p>
        </div>
    </div>
</body>
</html>'''


# ═══════════════════════════════════════════════════════════════════════════════
# ENTERPRISE EXPORT FUNCTIONS - Phase 1 & Phase 2
# ═══════════════════════════════════════════════════════════════════════════════

def generate_excel_report(filepath, results, unified_stats, timestamp, total, unique, duplicates, dedup_rate, toxic_rate):
    """
    Generate SMART EXCEL (.xlsx) with multiple sheets.

    Sheets actually written:
    ├─ Executive Summary (headline KPIs plus per-product counts)
    ├─ All Data (one row per query, auto-filter enabled)
    ├─ Metadata (report ID, generation time, source file, ...)
    └─ Legend (meaning of the status/action columns)

    Args:
        filepath: Source data file name, recorded on the Metadata sheet.
        results: Iterable of per-query dicts; the keys read are ``query``,
            ``is_duplicate``, ``confidence`` and the nested ``unified`` dict.
        unified_stats: Aggregate counters; must contain ``masters``,
            ``merges``, ``cache_hits``, ``cache_misses``, ``toxic_count``,
            ``ban_count``, ``warning_count``, ``monitor_count`` and
            ``spam_count``.
        timestamp: Report ID; also part of the output file name.
        total: Number of analyzed queries (cache hit rate is "0%" when 0).
        unique: Not used by this function; kept for call compatibility.
        duplicates: Not used by this function; kept for call compatibility.
        dedup_rate: Deduplication rate in percent.
        toxic_rate: Toxicity rate in percent.

    Returns:
        Path of the written workbook, or None when openpyxl is unavailable.
    """
    if not EXCEL_AVAILABLE:
        print("   ⚠️ Excel export skipped (install: pip install openpyxl)")
        return None

    # Imported here (after the availability check) so the file needs no new
    # top-level import; this is the pattern openpyxl itself uses to reject
    # control characters in cell strings.
    from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

    excel_file = os.path.join("reports", f"hiwosy_report_{timestamp}.xlsx")
    # wb.save() does not create missing folders; make sure the target exists.
    os.makedirs(os.path.dirname(excel_file), exist_ok=True)
    wb = openpyxl.Workbook()

    # Styles
    header_font = Font(bold=True, color="FFFFFF", size=11)
    header_fill = PatternFill(start_color="F97316", end_color="F97316", fill_type="solid")
    kpi_font = Font(bold=True, size=24, color="F97316")
    label_font = Font(size=10, color="666666")

    # ═══════════════════════════════════════════════════════════════════════════
    # SHEET 1: Executive Summary
    # ═══════════════════════════════════════════════════════════════════════════
    ws_summary = wb.active
    ws_summary.title = "Executive Summary"

    # Title
    ws_summary['A1'] = "🧠 HIWOSY™ UNIFIED API REPORT"
    ws_summary['A1'].font = Font(bold=True, size=20, color="F97316")
    ws_summary.merge_cells('A1:D1')

    ws_summary['A2'] = "3 Products • 1 API Key • Enterprise Intelligence"
    ws_summary['A2'].font = Font(size=12, color="888888")
    ws_summary.merge_cells('A2:D2')

    # KPI Cards
    kpis = [
        ("Total Queries", total),
        ("Dedup Rate", f"{dedup_rate:.1f}%"),
        ("Cache Hit Rate", f"{unified_stats['cache_hits']/total*100:.1f}%" if total > 0 else "0%"),
        ("Toxicity Rate", f"{toxic_rate:.1f}%"),
    ]

    for col, (label, value) in enumerate(kpis, start=1):
        ws_summary.cell(row=4, column=col, value=str(value)).font = kpi_font
        ws_summary.cell(row=5, column=col, value=label).font = label_font

    # Product 1 Stats
    ws_summary['A7'] = "📦 PRODUCT 1: SEMANTIC DEDUPLICATION"
    ws_summary['A7'].font = Font(bold=True, size=14)
    ws_summary['A8'] = "MASTER (Unique)"
    ws_summary['B8'] = unified_stats['masters']
    ws_summary['A9'] = "MERGE (Duplicate)"
    ws_summary['B9'] = unified_stats['merges']
    ws_summary['A10'] = "Compression Rate"
    ws_summary['B10'] = f"{dedup_rate:.2f}%"

    # Product 2 Stats
    ws_summary['A12'] = "💾 PRODUCT 2: SEMANTIC CACHE"
    ws_summary['A12'].font = Font(bold=True, size=14)
    ws_summary['A13'] = "Cache HITs"
    ws_summary['B13'] = unified_stats['cache_hits']
    ws_summary['A14'] = "Cache MISSes"
    ws_summary['B14'] = unified_stats['cache_misses']

    # Product 3 Stats
    ws_summary['A16'] = "🎮 PRODUCT 3: GAMING BEHAVIOR"
    ws_summary['A16'].font = Font(bold=True, size=14)
    ws_summary['A17'] = "Safe Messages"
    ws_summary['B17'] = total - unified_stats['toxic_count']
    ws_summary['A18'] = "Toxic Messages"
    ws_summary['B18'] = unified_stats['toxic_count']
    ws_summary['A19'] = "BAN Recommended"
    ws_summary['B19'] = unified_stats['ban_count']
    ws_summary['A20'] = "WARNING Recommended"
    ws_summary['B20'] = unified_stats['warning_count']
    ws_summary['A21'] = "MONITOR Recommended"
    ws_summary['B21'] = unified_stats['monitor_count']
    ws_summary['A22'] = "Spam Detected"
    ws_summary['B22'] = unified_stats['spam_count']

    # Adjust column widths
    ws_summary.column_dimensions['A'].width = 25
    ws_summary.column_dimensions['B'].width = 15
    ws_summary.column_dimensions['C'].width = 15
    ws_summary.column_dimensions['D'].width = 15

    # ═══════════════════════════════════════════════════════════════════════════
    # SHEET 2: All Data (Full Per-Query Results)
    # ═══════════════════════════════════════════════════════════════════════════
    ws_data = wb.create_sheet("All Data")

    # Headers
    headers = [
        'Row_ID', 'Query_Text',
        'PRODUCT1_Status', 'PRODUCT1_Action', 'PRODUCT1_Similarity',
        'PRODUCT2_Cache_Status',
        'PRODUCT3_Toxicity_Level', 'PRODUCT3_Score', 'PRODUCT3_Action',
        'PRODUCT3_Is_Spam', 'PRODUCT3_Cheat_Mention'
    ]

    for col, header in enumerate(headers, start=1):
        cell = ws_data.cell(row=1, column=col, value=header)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = Alignment(horizontal='center')

    # Data rows
    for row_num, r in enumerate(results, start=2):
        unified = r.get("unified", {})
        is_dup = r.get("is_duplicate", False)

        query_text = r.get("query", "")[:200]
        if isinstance(query_text, str):
            # openpyxl raises IllegalCharacterError on control characters,
            # which would abort the whole export for one bad message.
            query_text = ILLEGAL_CHARACTERS_RE.sub("", query_text)

        ws_data.cell(row=row_num, column=1, value=row_num - 1)
        query_cell = ws_data.cell(row=row_num, column=2, value=query_text)
        if query_cell.data_type == "f":
            # Text starting with "=" is stored as a formula by openpyxl;
            # user-supplied messages must stay literal text.
            query_cell.data_type = "s"
        ws_data.cell(row=row_num, column=3, value="DUPLICATE" if is_dup else "UNIQUE")
        ws_data.cell(row=row_num, column=4, value=unified.get("dedup_status", "MASTER"))
        ws_data.cell(row=row_num, column=5, value=round(r.get("confidence", 0), 4))
        ws_data.cell(row=row_num, column=6, value=unified.get("cache_status", "MASTER"))
        ws_data.cell(row=row_num, column=7, value=unified.get("toxicity_level", "SAFE"))
        ws_data.cell(row=row_num, column=8, value=unified.get("toxicity_score", 0))
        ws_data.cell(row=row_num, column=9, value=unified.get("action", "NONE").upper())
        ws_data.cell(row=row_num, column=10, value="YES" if unified.get("is_spam") else "NO")
        ws_data.cell(row=row_num, column=11, value="YES" if unified.get("cheat_mention") else "NO")

    # Auto-filter for Power BI
    ws_data.auto_filter.ref = f"A1:K{len(results) + 1}"

    # Adjust column widths
    ws_data.column_dimensions['A'].width = 8
    ws_data.column_dimensions['B'].width = 50
    for col in ['C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K']:
        ws_data.column_dimensions[col].width = 18

    # ═══════════════════════════════════════════════════════════════════════════
    # SHEET 3: Metadata
    # ═══════════════════════════════════════════════════════════════════════════
    ws_meta = wb.create_sheet("Metadata")

    metadata_rows = [
        ("Report ID", timestamp),
        ("Generated At", datetime.now().isoformat()),
        ("Source File", filepath),
        ("Total Queries", total),
        ("API Version", "2.1.0"),
        ("Tool", "HIWOSY™ Unified API"),
        ("Documentation", "https://www.hiwosy.com/docs"),
    ]

    ws_meta['A1'] = "⚙️ REPORT METADATA"
    ws_meta['A1'].font = Font(bold=True, size=14)
    ws_meta.merge_cells('A1:B1')

    for row, (key, value) in enumerate(metadata_rows, start=3):
        ws_meta.cell(row=row, column=1, value=key).font = Font(bold=True)
        ws_meta.cell(row=row, column=2, value=str(value))

    ws_meta.column_dimensions['A'].width = 20
    ws_meta.column_dimensions['B'].width = 50

    # ═══════════════════════════════════════════════════════════════════════════
    # SHEET 4: Legend
    # ═══════════════════════════════════════════════════════════════════════════
    ws_legend = wb.create_sheet("Legend")

    ws_legend['A1'] = "📖 COLUMN LEGEND"
    ws_legend['A1'].font = Font(bold=True, size=14)

    legends = [
        ("PRODUCT1_Status", "UNIQUE = First occurrence | DUPLICATE = Similar exists"),
        ("PRODUCT1_Action", "MASTER = Keep as original | MERGE = Merge with master"),
        ("PRODUCT2_Cache_Status", "MASTER = Cache response | MERGE = Use cached response"),
        ("PRODUCT3_Toxicity_Level", "SAFE | MILD | MODERATE | HIGH | SEVERE"),
        ("PRODUCT3_Action", "NONE | MONITOR | WARN | BAN"),
    ]

    for row, (col_name, description) in enumerate(legends, start=3):
        ws_legend.cell(row=row, column=1, value=col_name).font = Font(bold=True)
        ws_legend.cell(row=row, column=2, value=description)

    ws_legend.column_dimensions['A'].width = 25
    ws_legend.column_dimensions['B'].width = 60

    # Save
    wb.save(excel_file)
    return excel_file


def generate_parquet_output(results, timestamp):
    """
    Generate an Apache Parquet (.parquet) file from the API results.

    Writes ``reports/hiwosy_report_<timestamp>.parquet`` with one row per
    result, using Snappy compression. Fields that are missing or explicitly
    ``None`` fall back to neutral defaults so a sparse result cannot abort
    the export.

    Args:
        results: List of API result dictionaries
        timestamp: Report timestamp for file naming

    Returns:
        Path of the written file, or None when PARQUET_AVAILABLE is false
        (an install hint is printed instead).
    """
    if not PARQUET_AVAILABLE:
        print("   ⚠️ Parquet export skipped (install: pip install pyarrow)")
        return None

    # Make sure the target folder exists so the write cannot fail on a fresh checkout.
    os.makedirs("reports", exist_ok=True)
    parquet_file = os.path.join("reports", f"hiwosy_report_{timestamp}.parquet")

    column_names = (
        "row_id",
        "query_text",
        "product1_status",
        "product1_action",
        "product1_similarity",
        "product2_cache_status",
        "product3_toxicity_level",
        "product3_score",
        "product3_action",
        "product3_is_spam",
        "product3_cheat_mention",
    )
    data = {name: [] for name in column_names}

    for row_num, r in enumerate(results, start=1):
        # ``or`` (instead of a .get default) also covers keys that are present
        # but hold None, which would otherwise break slicing/round()/upper().
        unified = r.get("unified") or {}
        query = r.get("query")

        row = {
            "row_id": row_num,
            "query_text": ("" if query is None else str(query))[:500],
            "product1_status": "DUPLICATE" if r.get("is_duplicate", False) else "UNIQUE",
            "product1_action": unified.get("dedup_status", "MASTER"),
            "product1_similarity": round(r.get("confidence") or 0, 4),
            "product2_cache_status": unified.get("cache_status", "MASTER"),
            "product3_toxicity_level": unified.get("toxicity_level", "SAFE"),
            "product3_score": unified.get("toxicity_score", 0),
            # A None/empty action is treated as "no action", matching the SAFE filter.
            "product3_action": str(unified.get("action") or "NONE").upper(),
            "product3_is_spam": unified.get("is_spam", False),
            "product3_cheat_mention": unified.get("cheat_mention", False),
        }
        for name in column_names:
            data[name].append(row[name])

    # Create the PyArrow table and write it out.
    table = pa.table(data)
    pq.write_table(table, parquet_file, compression='snappy')

    return parquet_file


def _sql_quote(value, max_len=None):
    """Return ``value`` as a single-quoted SQL string literal ('' for None)."""
    text = "" if value is None else str(value)
    if max_len is not None:
        # Truncate BEFORE escaping: slicing an already-escaped string can cut a
        # doubled quote ('') in half and leave the literal unterminated.
        text = text[:max_len]
    return "'" + text.replace("'", "''") + "'"


def _sql_number(value, default=0):
    """Return ``value`` as a SQL numeric literal, or ``default`` if it is not a finite number."""
    if isinstance(value, bool):
        return str(int(value))
    if not isinstance(value, (int, float)):
        try:
            value = float(value)
        except (TypeError, ValueError):
            return str(default)
    if isinstance(value, float) and (value != value or value in (float("inf"), float("-inf"))):
        return str(default)
    return str(value)


def generate_sql_output(results, timestamp, table_name="hiwosy_results"):
    """
    Generate a SQL script with a CREATE TABLE and one multi-row INSERT.

    Writes ``reports/hiwosy_report_<timestamp>.sql``. Every text value is
    emitted as an escaped string literal and every numeric value is validated,
    so unusual query text cannot break out of its literal. When ``results`` is
    empty the INSERT statement is omitted (an INSERT without rows is not valid
    SQL).

    NOTE: backslashes are written verbatim; servers that treat backslash as an
    escape character inside string literals may need them doubled.

    Args:
        results: List of API result dictionaries
        timestamp: Report timestamp for file naming
        table_name: Target table name (written into the script as-is)

    Returns:
        Path of the written .sql file
    """
    os.makedirs("reports", exist_ok=True)
    sql_file = os.path.join("reports", f"hiwosy_report_{timestamp}.sql")

    # Build the VALUES tuples first so the file is only opened once all rows rendered.
    rows = []
    for r in results:
        unified = r.get("unified") or {}
        fields = (
            _sql_quote(r.get("query"), max_len=500),
            "'DUPLICATE'" if r.get("is_duplicate", False) else "'UNIQUE'",
            _sql_quote(unified.get("dedup_status", "MASTER")),
            _sql_number(round(r.get("confidence") or 0, 4)),
            _sql_quote(unified.get("cache_status", "MASTER")),
            _sql_quote(unified.get("toxicity_level", "SAFE")),
            _sql_number(unified.get("toxicity_score", 0)),
            # A None/empty action means "no action", matching the SAFE filter.
            _sql_quote(str(unified.get("action") or "NONE").upper()),
            "TRUE" if unified.get("is_spam") else "FALSE",
            "TRUE" if unified.get("cheat_mention") else "FALSE",
        )
        rows.append("(" + ", ".join(fields) + ")")

    with open(sql_file, 'w', encoding='utf-8') as f:
        # Header comment
        f.write("-- HIWOSY™ Unified API Results\n")
        f.write(f"-- Generated: {datetime.now().isoformat()}\n")
        f.write(f"-- Total Records: {len(results)}\n")
        f.write("-- ═══════════════════════════════════════════════════════════════\n\n")

        # CREATE TABLE statement
        f.write("-- Create table (PostgreSQL/MySQL compatible)\n")
        f.write(f"CREATE TABLE IF NOT EXISTS {table_name} (\n")
        f.write(
            "    id SERIAL PRIMARY KEY,\n"
            "    query_text TEXT,\n"
            "    product1_status VARCHAR(20),\n"
            "    product1_action VARCHAR(20),\n"
            "    product1_similarity DECIMAL(5,4),\n"
            "    product2_cache_status VARCHAR(20),\n"
            "    product3_toxicity_level VARCHAR(20),\n"
            "    product3_score INTEGER,\n"
            "    product3_action VARCHAR(20),\n"
            "    product3_is_spam BOOLEAN,\n"
            "    product3_cheat_mention BOOLEAN,\n"
            "    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n"
            ");\n\n"
        )

        if rows:
            # INSERT statement (single multi-row VALUES list)
            f.write("-- Insert data\n")
            f.write(f"INSERT INTO {table_name} (\n")
            f.write(
                "    query_text, product1_status, product1_action, product1_similarity,\n"
                "    product2_cache_status, product3_toxicity_level, product3_score,\n"
                "    product3_action, product3_is_spam, product3_cheat_mention\n"
                ") VALUES\n"
            )
            f.write(",\n".join(rows))
            f.write(";\n\n")
        else:
            f.write("-- No records to insert\n\n")

        # COPY command for bulk import
        f.write("-- Alternative: Bulk import from CSV\n")
        f.write(f"-- COPY {table_name} FROM 'hiwosy_report_{timestamp}.csv' CSV HEADER;\n")

    return sql_file


def generate_powerbi_template(filepath, timestamp, total, unified_stats, dedup_rate, toxic_rate):
    """
    Generate a JSON dashboard configuration for use with Power BI.

    Writes ``reports/hiwosy_powerbi_<timestamp>.json`` containing KPI values,
    suggested visualizations, slicers and a Power Query M script that loads
    ``hiwosy_report_<timestamp>.csv``.

    Note: this is NOT a real .pbit archive (those are ZIP files with a
    specific structure); it is a JSON description plus an M script.

    Args:
        filepath: Source data path (accepted for interface compatibility; not used)
        timestamp: Report timestamp for file naming
        total: Total number of analyzed queries
        unified_stats: Statistics dictionary; ``cache_hits`` is read from it
        dedup_rate: Deduplication rate, stored as given in the KPI list
        toxic_rate: Toxicity rate, stored as given in the KPI list

    Returns:
        Path of the written JSON file
    """
    os.makedirs("reports", exist_ok=True)
    pbi_file = os.path.join("reports", f"hiwosy_powerbi_{timestamp}.json")
    csv_name = f"hiwosy_report_{timestamp}.csv"

    # A missing counter is treated as zero hits instead of raising KeyError.
    cache_hits = unified_stats.get('cache_hits', 0)
    cache_hit_rate = cache_hits / total * 100 if total > 0 else 0

    # Power Query M script for data transformation.
    # QuoteStyle.Csv keeps line breaks inside quoted fields as part of the
    # value; QuoteStyle.None would split multi-line query text into extra rows.
    power_query_m = "\n".join([
        "",
        "let",
        f'    Source = Csv.Document(File.Contents("{csv_name}"), '
        '[Delimiter=",", Columns=16, Encoding=65001, QuoteStyle=QuoteStyle.Csv]),',
        '    #"Promoted Headers" = Table.PromoteHeaders(Source, [PromoteAllScalars=true]),',
        '    #"Changed Type" = Table.TransformColumnTypes(#"Promoted Headers", {',
        '        {"Row_ID", Int64.Type},',
        '        {"PRODUCT1_Similarity_Score", type number},',
        '        {"PRODUCT3_Toxicity_Score", Int64.Type}',
        '    })',
        "in",
        '    #"Changed Type"',
        "",
    ])

    pbi_config = {
        "name": "HIWOSY Unified API Dashboard",
        "version": "1.0",
        "created": datetime.now().isoformat(),

        # Data source configuration
        "dataSource": {
            "type": "csv",
            "path": csv_name,
            "encoding": "utf-8"
        },

        # KPI definitions
        "kpis": [
            {"name": "Total Queries", "value": total, "format": "number"},
            {"name": "Dedup Rate", "value": dedup_rate, "format": "percent"},
            {"name": "Cache Hit Rate", "value": cache_hit_rate, "format": "percent"},
            {"name": "Toxicity Rate", "value": toxic_rate, "format": "percent"}
        ],

        # Suggested visualizations
        "visualizations": [
            {
                "type": "card",
                "title": "Total Queries Analyzed",
                "measure": "COUNT(Row_ID)"
            },
            {
                "type": "donut",
                "title": "Deduplication Results",
                "dimension": "PRODUCT1_Status",
                "measure": "COUNT(*)"
            },
            {
                "type": "bar",
                "title": "Toxicity Actions Distribution",
                "dimension": "PRODUCT3_Action",
                "measure": "COUNT(*)"
            },
            {
                "type": "table",
                "title": "Top Toxic Messages",
                "columns": ["Query_Text", "PRODUCT3_Toxicity_Level", "PRODUCT3_Action"],
                "filter": "PRODUCT3_Toxicity_Level != 'SAFE'",
                "limit": 20
            }
        ],

        "powerQueryM": power_query_m,

        # Slicer configurations
        "slicers": [
            {"field": "PRODUCT1_Status", "title": "Dedup Status"},
            {"field": "PRODUCT3_Toxicity_Level", "title": "Toxicity Level"},
            {"field": "PRODUCT3_Action", "title": "Recommended Action"}
        ]
    }

    with open(pbi_file, 'w', encoding='utf-8') as f:
        json.dump(pbi_config, f, indent=2, ensure_ascii=False)

    return pbi_file


# ═══════════════════════════════════════════════════════════════════════════════
# 📁 DATA FILTER REPORT SYSTEM - Filtered Output Generation
# ═══════════════════════════════════════════════════════════════════════════════

def generate_filtered_reports(results, timestamp, output_filters=None, output_formats=None):
    """
    Generate FILTERED reports - separate files for each category.
    
    Each filter generates files in ALL specified formats (CSV, JSON, XLSX, Parquet, SQL).
    
    Filters:
        1. ALL - Combined results (generated separately in main report)
        2. MASTERS - Only unique/original queries (MASTER status)
        3. DUPLICATES - Only duplicate queries (MERGE status)
        4. BAN - Queries with BAN action recommended
        5. WARNING - Queries with WARNING action
        6. SAFE - Clean queries (no toxicity detected)
        7. CACHE - Cache analysis (hits and misses separately)
    
    Args:
        results: List of API result dictionaries
        timestamp: Report timestamp for file naming
        output_filters: List of filters to generate (default: all)
        output_formats: List of formats to generate (default: csv, json, xlsx)
    
    Returns:
        Dictionary with filter names as keys and generated file paths as values
    """
    if output_filters is None:
        output_filters = DEFAULT_FILTERS
    if output_formats is None:
        output_formats = DEFAULT_FORMATS
    
    generated_files = {}
    total_results = len(results)
    
    print("\n" + "═" * 60)
    print("📁 DATA FILTER REPORT SYSTEM")
    print("   Generating separated files per category")
    print("═" * 60)
    print(f"   📋 Filters: {', '.join(output_filters)}")
    print(f"   📦 Formats: {', '.join(output_formats)}")
    print("─" * 60)
    
    # ═══════════════════════════════════════════════════════════════════════════
    # FILTER 2: MASTERS (Unique/Original Queries)
    # ═══════════════════════════════════════════════════════════════════════════
    if "masters" in output_filters:
        masters = [r for r in results if not r.get("is_duplicate", False)]
        if masters:
            generated_files["masters"] = save_filtered_file(
                data=masters,
                filename_base=f"hiwosy_MASTERS_{timestamp}",
                filter_type="MASTERS",
                filter_description="Unique/Original Queries (MASTER status - keep as originals)",
                total_original=total_results,
                formats=output_formats,
                timestamp=timestamp
            )
            pct = len(masters) / total_results * 100 if total_results > 0 else 0
            print(f"\n   ✅ MASTERS: {len(masters):,} queries ({pct:.1f}%)")
            print(f"      └─ Files: {', '.join(f'.{fmt}' for fmt in output_formats)}")
    
    # ═══════════════════════════════════════════════════════════════════════════
    # FILTER 3: DUPLICATES (Merge Candidates)
    # ═══════════════════════════════════════════════════════════════════════════
    if "duplicates" in output_filters:
        duplicates = [r for r in results if r.get("is_duplicate", False)]
        if duplicates:
            generated_files["duplicates"] = save_filtered_file(
                data=duplicates,
                filename_base=f"hiwosy_DUPLICATES_{timestamp}",
                filter_type="DUPLICATES",
                filter_description="Duplicate Queries (MERGE status - merge with master)",
                total_original=total_results,
                formats=output_formats,
                timestamp=timestamp
            )
            pct = len(duplicates) / total_results * 100 if total_results > 0 else 0
            print(f"\n   🔗 DUPLICATES: {len(duplicates):,} queries ({pct:.1f}%)")
            print(f"      └─ Files: {', '.join(f'.{fmt}' for fmt in output_formats)}")
    
    # ═══════════════════════════════════════════════════════════════════════════
    # FILTER 4: BAN (High Toxicity - Immediate Action Required)
    # ═══════════════════════════════════════════════════════════════════════════
    if "ban" in output_filters:
        ban_queries = [r for r in results if r.get("unified", {}).get("action") == "BAN"]
        if ban_queries:
            generated_files["ban"] = save_filtered_file(
                data=ban_queries,
                filename_base=f"hiwosy_BAN_{timestamp}",
                filter_type="BAN",
                filter_description="High Toxicity - Immediate BAN Recommended",
                total_original=total_results,
                formats=output_formats,
                timestamp=timestamp
            )
            pct = len(ban_queries) / total_results * 100 if total_results > 0 else 0
            print(f"\n   🚫 BAN: {len(ban_queries):,} queries ({pct:.1f}%)")
            print(f"      └─ Files: {', '.join(f'.{fmt}' for fmt in output_formats)}")
        else:
            print(f"\n   🚫 BAN: 0 queries (no high toxicity detected)")
    
    # ═══════════════════════════════════════════════════════════════════════════
    # FILTER 5: WARNING (Medium Toxicity)
    # ═══════════════════════════════════════════════════════════════════════════
    if "warning" in output_filters:
        warning_queries = [r for r in results if r.get("unified", {}).get("action") == "WARNING"]
        if warning_queries:
            generated_files["warning"] = save_filtered_file(
                data=warning_queries,
                filename_base=f"hiwosy_WARNING_{timestamp}",
                filter_type="WARNING",
                filter_description="Medium Toxicity - WARNING Recommended",
                total_original=total_results,
                formats=output_formats,
                timestamp=timestamp
            )
            pct = len(warning_queries) / total_results * 100 if total_results > 0 else 0
            print(f"\n   ⚠️ WARNING: {len(warning_queries):,} queries ({pct:.1f}%)")
            print(f"      └─ Files: {', '.join(f'.{fmt}' for fmt in output_formats)}")
        else:
            print(f"\n   ⚠️ WARNING: 0 queries (no medium toxicity detected)")
    
    # ═══════════════════════════════════════════════════════════════════════════
    # FILTER 6: SAFE (Clean Content - No Toxicity)
    # ═══════════════════════════════════════════════════════════════════════════
    if "safe" in output_filters:
        safe_queries = [r for r in results 
                       if r.get("unified", {}).get("toxicity_level", "SAFE") == "SAFE"
                       and r.get("unified", {}).get("action", "NONE") in ["NONE", None, ""]]
        if safe_queries:
            generated_files["safe"] = save_filtered_file(
                data=safe_queries,
                filename_base=f"hiwosy_SAFE_{timestamp}",
                filter_type="SAFE",
                filter_description="Clean Content - No Toxicity Detected",
                total_original=total_results,
                formats=output_formats,
                timestamp=timestamp
            )
            pct = len(safe_queries) / total_results * 100 if total_results > 0 else 0
            print(f"\n   ✅ SAFE: {len(safe_queries):,} queries ({pct:.1f}%)")
            print(f"      └─ Files: {', '.join(f'.{fmt}' for fmt in output_formats)}")
    
    # ═══════════════════════════════════════════════════════════════════════════
    # FILTER 7: CACHE (API Cache Analysis - Hits and Misses)
    # ═══════════════════════════════════════════════════════════════════════════
    if "cache" in output_filters:
        # Cache HITS - Use cached response (duplicates)
        cache_hits = [r for r in results if r.get("unified", {}).get("cache_status") == "MERGE"]
        if cache_hits:
            generated_files["cache_hits"] = save_filtered_file(
                data=cache_hits,
                filename_base=f"hiwosy_CACHE_HITS_{timestamp}",
                filter_type="CACHE_HITS",
                filter_description="Cache HITS - Use Cached Response (API Cost Savings)",
                total_original=total_results,
                formats=output_formats,
                timestamp=timestamp
            )
            pct = len(cache_hits) / total_results * 100 if total_results > 0 else 0
            print(f"\n   ⚡ CACHE HITS: {len(cache_hits):,} queries ({pct:.1f}%)")
            print(f"      └─ Files: {', '.join(f'.{fmt}' for fmt in output_formats)}")
        
        # Cache MISSES - Store new response (unique queries)
        cache_misses = [r for r in results if r.get("unified", {}).get("cache_status") == "MASTER"]
        if cache_misses:
            generated_files["cache_misses"] = save_filtered_file(
                data=cache_misses,
                filename_base=f"hiwosy_CACHE_MISSES_{timestamp}",
                filter_type="CACHE_MISSES",
                filter_description="Cache MISSES - Store New Response (New Unique Queries)",
                total_original=total_results,
                formats=output_formats,
                timestamp=timestamp
            )
            pct = len(cache_misses) / total_results * 100 if total_results > 0 else 0
            print(f"\n   💾 CACHE MISSES: {len(cache_misses):,} queries ({pct:.1f}%)")
            print(f"      └─ Files: {', '.join(f'.{fmt}' for fmt in output_formats)}")
    
    print("\n" + "─" * 60)
    
    # Summary
    total_files = sum(len(files) for files in generated_files.values() if isinstance(files, dict))
    print(f"   📊 Total filtered files generated: {total_files}")
    
    return generated_files


def _filtered_record(r):
    """Normalize one API result dict into the flat values shared by all writers."""
    # ``or`` also covers keys that are present but None, which would otherwise
    # break slicing, round() and upper() further down.
    unified = r.get("unified") or {}
    is_dup = bool(r.get("is_duplicate", False))
    query = r.get("query")
    return {
        "query": "" if query is None else str(query),
        "is_dup": is_dup,
        "status": "DUPLICATE" if is_dup else "UNIQUE",
        "dedup_action": unified.get("dedup_status", "MASTER"),
        "duplicate_of": r.get("duplicate_of"),
        "confidence": r.get("confidence") or 0,
        # Match type is only meaningful for duplicates.
        "match_type": r.get("match_type", "") if is_dup else None,
        "cache_status": unified.get("cache_status", "MASTER"),
        "toxicity_level": unified.get("toxicity_level", "SAFE"),
        "toxicity_score": unified.get("toxicity_score", 0),
        # A None/empty action means "no action", matching the SAFE filter.
        "action": str(unified.get("action") or "NONE").upper(),
        "is_spam": unified.get("is_spam", False),
        "cheat_mention": unified.get("cheat_mention", False),
        "is_repeated": unified.get("is_repeated", False),
        "is_toxic": unified.get("is_toxic", False),
        "cluster_id": r.get("cluster_id"),
    }


def _filtered_sql_text(value, max_len=None):
    """Return ``value`` as a single-quoted SQL string literal ('' for None)."""
    text = "" if value is None else str(value)
    if max_len is not None:
        # Truncate BEFORE escaping so a doubled quote ('') is never cut in half.
        text = text[:max_len]
    return "'" + text.replace("'", "''") + "'"


def _filtered_sql_number(value, default=0):
    """Return ``value`` as a SQL numeric literal, or ``default`` if it is not a finite number."""
    if isinstance(value, bool):
        return str(int(value))
    if not isinstance(value, (int, float)):
        try:
            value = float(value)
        except (TypeError, ValueError):
            return str(default)
    if isinstance(value, float) and (value != value or value in (float("inf"), float("-inf"))):
        return str(default)
    return str(value)


def _write_filtered_csv(csv_file, records, filter_type):
    """Write the 16-column production CSV (all three products plus metadata)."""
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            'Row_ID', 'Query_Text',
            # Product 1: Deduplication
            'PRODUCT1_Dedup_Status', 'PRODUCT1_Dedup_Action',
            'PRODUCT1_Duplicate_Of', 'PRODUCT1_Similarity_Score', 'PRODUCT1_Match_Type',
            # Product 2: Cache
            'PRODUCT2_Cache_Status',
            # Product 3: Toxicity
            'PRODUCT3_Toxicity_Level', 'PRODUCT3_Toxicity_Score',
            'PRODUCT3_Recommended_Action', 'PRODUCT3_Is_Spam',
            'PRODUCT3_Cheat_Mention', 'PRODUCT3_Is_Repeated',
            # Metadata
            'Cluster_ID', 'Filter_Type'
        ])
        for row_num, rec in enumerate(records, start=1):
            # csv.writer renders None as an empty field, so optional values
            # (duplicate_of, match_type, cluster_id) need no special casing.
            writer.writerow([
                row_num,
                rec["query"][:500],
                rec["status"],
                rec["dedup_action"],
                rec["duplicate_of"],
                f"{rec['confidence']:.4f}",
                rec["match_type"],
                rec["cache_status"],
                rec["toxicity_level"],
                rec["toxicity_score"],
                rec["action"],
                "YES" if rec["is_spam"] else "NO",
                "YES" if rec["cheat_mention"] else "NO",
                "YES" if rec["is_repeated"] else "NO",
                rec["cluster_id"],
                filter_type
            ])


def _write_filtered_json(json_file, records, filter_type, filter_description,
                         percentage, total_original, timestamp):
    """Write the JSON report: wrapper, filter info, summary counts and per-query data."""
    count = len(records)
    # NOTE(review): the report legend spells the action WARN while older code
    # counted only WARNING; both are counted here - confirm the API spelling.
    warning_actions = ("WARNING", "WARN")

    json_report = {
        # API Wrapper
        "api_version": "2.1.0",
        "run_id": f"hiwosy_run_{timestamp}_{uuid.uuid4().hex[:8]}",
        "status": "completed",
        "created_at": datetime.now(timezone.utc).isoformat(),

        # Filter Information
        "filter": {
            "type": filter_type,
            "description": filter_description,
            "query_count": count,
            "percentage_of_total": round(percentage, 2),
            "total_in_original_dataset": total_original
        },

        # Summary statistics, computed from the same normalized values that are
        # written under "data" so the counts always agree with the rows.
        "summary": {
            "total_in_filter": count,
            "unique_count": sum(1 for rec in records if not rec["is_dup"]),
            "duplicate_count": sum(1 for rec in records if rec["is_dup"]),
            "toxic_count": sum(1 for rec in records if rec["is_toxic"]),
            "safe_count": sum(1 for rec in records if rec["toxicity_level"] == "SAFE"),
            "ban_count": sum(1 for rec in records if rec["action"] == "BAN"),
            "warning_count": sum(1 for rec in records if rec["action"] in warning_actions),
            "spam_count": sum(1 for rec in records if rec["is_spam"])
        },

        # Metadata
        "metadata": {
            "tool": "HIWOSY™ Unified API - Data Filter Report System",
            "documentation": "https://www.hiwosy.com/docs",
            "generated_by": "test_my_data.py",
            "timestamp": timestamp
        },

        # Per-Query Data
        "data": [
            {
                "row_id": row_num,
                "query_text": rec["query"][:500],
                "product1_dedup": {
                    "status": rec["status"],
                    "action": rec["dedup_action"],
                    "duplicate_of": rec["duplicate_of"],
                    "similarity_score": round(rec["confidence"], 4),
                    "match_type": rec["match_type"]
                },
                "product2_cache": {
                    "status": rec["cache_status"]
                },
                "product3_toxicity": {
                    "level": rec["toxicity_level"],
                    "score": rec["toxicity_score"],
                    "action": rec["action"],
                    "is_spam": rec["is_spam"],
                    "cheat_mention": rec["cheat_mention"],
                    "is_repeated": rec["is_repeated"]
                },
                "cluster_id": rec["cluster_id"]
            }
            for row_num, rec in enumerate(records, start=1)
        ],

        # Warnings/Errors
        "warnings": [],
        "errors": []
    }

    if count == 0:
        json_report["warnings"].append({
            "code": "EMPTY_FILTER",
            "message": f"No queries matched the {filter_type} filter"
        })

    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(json_report, f, indent=2, ensure_ascii=False)


def _write_filtered_xlsx(xlsx_file, records, filter_type, filter_description,
                         percentage, total_original):
    """Write a two-sheet workbook: a Summary sheet and a filterable Data sheet."""
    wb = openpyxl.Workbook()

    # Styles
    header_font = Font(bold=True, color="FFFFFF", size=11)
    header_fill = PatternFill(start_color="F97316", end_color="F97316", fill_type="solid")
    title_font = Font(bold=True, size=16, color="F97316")

    # Sheet 1: Summary
    ws_summary = wb.active
    ws_summary.title = "Summary"
    ws_summary['A1'] = f"🧠 HIWOSY™ - {filter_type} Report"
    ws_summary['A1'].font = title_font
    ws_summary.merge_cells('A1:C1')

    summary_rows = [
        ("Filter Type", filter_type),
        ("Description", filter_description),
        ("Query Count", len(records)),
        ("Percentage of Total", f"{percentage:.2f}%"),
        ("Total in Original Dataset", total_original),
        ("Generated At", datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
    ]
    for row, (label, value) in enumerate(summary_rows, start=3):
        ws_summary.cell(row=row, column=1, value=label).font = Font(bold=True)
        ws_summary.cell(row=row, column=2, value=value)

    ws_summary.column_dimensions['A'].width = 25
    ws_summary.column_dimensions['B'].width = 50

    # Sheet 2: Data
    ws_data = wb.create_sheet("Data")
    headers = [
        'Row_ID', 'Query_Text',
        'PRODUCT1_Status', 'PRODUCT1_Action', 'PRODUCT1_Similarity',
        'PRODUCT2_Cache_Status',
        'PRODUCT3_Toxicity', 'PRODUCT3_Score', 'PRODUCT3_Action',
        'Is_Spam', 'Cheat_Mention'
    ]
    for col, header in enumerate(headers, start=1):
        cell = ws_data.cell(row=1, column=col, value=header)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = Alignment(horizontal='center')

    # Data starts on sheet row 2; Row_ID is therefore sheet row - 1.
    for row_num, rec in enumerate(records, start=2):
        values = [
            row_num - 1,
            rec["query"][:200],
            rec["status"],
            rec["dedup_action"],
            round(rec["confidence"], 4),
            rec["cache_status"],
            rec["toxicity_level"],
            rec["toxicity_score"],
            rec["action"],
            "YES" if rec["is_spam"] else "NO",
            "YES" if rec["cheat_mention"] else "NO",
        ]
        for col, value in enumerate(values, start=1):
            ws_data.cell(row=row_num, column=col, value=value)

    # Auto-filter over the header plus all data rows
    ws_data.auto_filter.ref = f"A1:K{len(records) + 1}"

    # Column widths
    ws_data.column_dimensions['A'].width = 8
    ws_data.column_dimensions['B'].width = 50
    for col in ['C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K']:
        ws_data.column_dimensions[col].width = 16

    wb.save(xlsx_file)


def _write_filtered_parquet(parquet_file, records, filter_type):
    """Write the filtered rows as a Snappy-compressed Parquet file."""
    parquet_data = {
        "row_id": [],
        "query_text": [],
        "product1_status": [],
        "product1_action": [],
        "product1_similarity": [],
        "product2_cache_status": [],
        "product3_toxicity_level": [],
        "product3_score": [],
        "product3_action": [],
        "product3_is_spam": [],
        "product3_cheat_mention": [],
        "filter_type": []
    }
    for row_num, rec in enumerate(records, start=1):
        parquet_data["row_id"].append(row_num)
        parquet_data["query_text"].append(rec["query"][:500])
        parquet_data["product1_status"].append(rec["status"])
        parquet_data["product1_action"].append(rec["dedup_action"])
        parquet_data["product1_similarity"].append(round(rec["confidence"], 4))
        parquet_data["product2_cache_status"].append(rec["cache_status"])
        parquet_data["product3_toxicity_level"].append(rec["toxicity_level"])
        parquet_data["product3_score"].append(rec["toxicity_score"])
        parquet_data["product3_action"].append(rec["action"])
        parquet_data["product3_is_spam"].append(rec["is_spam"])
        parquet_data["product3_cheat_mention"].append(rec["cheat_mention"])
        parquet_data["filter_type"].append(filter_type)

    table = pa.table(parquet_data)
    pq.write_table(table, parquet_file, compression='snappy')


def _write_filtered_sql(sql_file, records, filter_type, filter_description):
    """Write CREATE TABLE plus, when there are rows, one multi-row INSERT."""
    table_name = f"hiwosy_{filter_type.lower()}"

    rows = []
    for rec in records:
        fields = (
            _filtered_sql_text(rec["query"], max_len=500),
            _filtered_sql_text(rec["status"]),
            _filtered_sql_text(rec["dedup_action"]),
            _filtered_sql_number(round(rec["confidence"], 4)),
            _filtered_sql_text(rec["cache_status"]),
            _filtered_sql_text(rec["toxicity_level"]),
            _filtered_sql_number(rec["toxicity_score"]),
            _filtered_sql_text(rec["action"]),
            "TRUE" if rec["is_spam"] else "FALSE",
            "TRUE" if rec["cheat_mention"] else "FALSE",
            _filtered_sql_text(filter_type),
        )
        rows.append("(" + ", ".join(fields) + ")")

    with open(sql_file, 'w', encoding='utf-8') as f:
        f.write(f"-- HIWOSY™ {filter_type} Filter Results\n")
        f.write(f"-- Generated: {datetime.now().isoformat()}\n")
        f.write(f"-- Filter: {filter_description}\n")
        f.write(f"-- Records: {len(records)}\n")
        f.write("-- ═══════════════════════════════════════════════════════════════\n\n")

        # CREATE TABLE
        f.write(f"CREATE TABLE IF NOT EXISTS {table_name} (\n")
        f.write(
            "    id SERIAL PRIMARY KEY,\n"
            "    query_text TEXT,\n"
            "    product1_status VARCHAR(20),\n"
            "    product1_action VARCHAR(20),\n"
            "    product1_similarity DECIMAL(5,4),\n"
            "    product2_cache_status VARCHAR(20),\n"
            "    product3_toxicity_level VARCHAR(20),\n"
            "    product3_score INTEGER,\n"
            "    product3_action VARCHAR(20),\n"
            "    product3_is_spam BOOLEAN,\n"
            "    product3_cheat_mention BOOLEAN,\n"
            "    filter_type VARCHAR(30),\n"
            "    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n"
            ");\n\n"
        )

        # An INSERT without any rows would be invalid SQL, so it is skipped.
        if rows:
            f.write(f"INSERT INTO {table_name} (\n")
            f.write(
                "    query_text, product1_status, product1_action, product1_similarity,\n"
                "    product2_cache_status, product3_toxicity_level, product3_score,\n"
                "    product3_action, product3_is_spam, product3_cheat_mention, filter_type\n"
                ") VALUES\n"
            )
            f.write(",\n".join(rows))
            f.write(";\n")


def save_filtered_file(data, filename_base, filter_type, filter_description,
                       total_original, formats, timestamp):
    """
    Save filtered data to ALL specified formats in organized subfolders.

    Files are written to ``reports/<filter_type>/`` (created on demand); only
    the base name of ``filename_base`` is used. Each result is normalized once
    so every format shows the same values, and missing/None fields fall back
    to neutral defaults instead of raising.

    Args:
        data: List of result dictionaries for this filter
        filename_base: Base filename without extension
        filter_type: Type of filter (e.g., "MASTERS", "BAN")
        filter_description: Human-readable description
        total_original: Total count in original dataset
        formats: List of formats to generate (csv, json, xlsx, parquet, sql)
        timestamp: Report timestamp

    Returns:
        Dictionary with format names as keys and file paths as values.
        xlsx/parquet are omitted when EXCEL_AVAILABLE/PARQUET_AVAILABLE is false.
    """
    saved_files = {}
    count = len(data)
    percentage = count / total_original * 100 if total_original > 0 else 0

    # Create the reports/FILTER_TYPE/ folder and re-root the file name into it.
    filter_folder = os.path.join("reports", filter_type)
    os.makedirs(filter_folder, exist_ok=True)
    filename_base = os.path.join(filter_folder, os.path.basename(filename_base))

    records = [_filtered_record(r) for r in data]

    if "csv" in formats:
        csv_file = f"{filename_base}.csv"
        _write_filtered_csv(csv_file, records, filter_type)
        saved_files["csv"] = csv_file

    if "json" in formats:
        json_file = f"{filename_base}.json"
        _write_filtered_json(json_file, records, filter_type, filter_description,
                             percentage, total_original, timestamp)
        saved_files["json"] = json_file

    if "xlsx" in formats and EXCEL_AVAILABLE:
        xlsx_file = f"{filename_base}.xlsx"
        _write_filtered_xlsx(xlsx_file, records, filter_type, filter_description,
                             percentage, total_original)
        saved_files["xlsx"] = xlsx_file

    if "parquet" in formats and PARQUET_AVAILABLE:
        parquet_file = f"{filename_base}.parquet"
        _write_filtered_parquet(parquet_file, records, filter_type)
        saved_files["parquet"] = parquet_file

    if "sql" in formats:
        sql_file = f"{filename_base}.sql"
        _write_filtered_sql(sql_file, records, filter_type, filter_description)
        saved_files["sql"] = sql_file

    return saved_files


def print_usage():
    """Write the command-line help text to standard output.

    The text covers the invocation syntax, each supported option, example
    commands, the filter types, the output formats and the accepted input
    file layouts.
    """
    # Kept as a single literal so the emitted help stays byte-for-byte stable.
    usage_text = """
Usage: python test_my_data.py <file.csv> [options]

Options:
    --column NAME      Specify which column contains the text to analyze
    --limit N          Only process first N rows (default: all)
    --dedup_scope X    Deduplication scope: batch|session|daily|historical
    --filters LIST     Comma-separated list of filters to generate
                       (all,masters,duplicates,ban,warning,safe,cache)
    --formats LIST     Comma-separated list of output formats
                       (csv,json,xlsx,parquet,sql)
    --help             Show this help message

Examples:
    python test_my_data.py tickets.csv
    python test_my_data.py data.csv --column "message"
    python test_my_data.py logs.csv --limit 500 --formats "csv,json"
    python test_my_data.py data.csv --filters "masters,ban,safe" --formats "csv,json,xlsx"

Filter Types:
    all        - All combined results (always generated)
    masters    - Only unique/original queries (MASTER status)
    duplicates - Only duplicate queries (MERGE status)
    ban        - Queries with BAN action recommended (high toxicity)
    warning    - Queries with WARNING action (medium toxicity)
    safe       - Clean queries (no toxicity detected)
    cache      - Cache analysis (hits and misses separately)

Output Formats:
    csv     - Comma-separated values (Excel, Google Sheets)
    json    - Professional JSON with metadata wrapper
    xlsx    - Multi-sheet Excel workbook (Power BI ready)
    parquet - Columnar format for big data (Spark, BigQuery)
    sql     - Ready-to-execute SQL INSERT statements

Supported input formats:
    - CSV with headers (auto-detects text column)
    - CSV without headers (uses first column)
    - One query per line text files
    """
    print(usage_text)


def _parse_cli_options(args):
    """Parse the optional arguments that follow the input file path.

    Unknown arguments and value options given without a value are reported
    and skipped, so a typo such as ``--dedup-scope`` no longer disappears
    silently.

    Args:
        args: Command-line tokens after the file path (``sys.argv[2:]``).

    Returns:
        A ``(column, limit, dedup_scope, output_filters, output_formats)``
        tuple. Options that are absent or invalid keep their defaults:
        ``None``, ``None`` (all rows), ``"batch"``, a copy of
        ``DEFAULT_FILTERS`` and a copy of ``DEFAULT_FORMATS``.
    """
    column = None
    limit = None  # None means "process every row"
    dedup_scope = "batch"  # Default: each run is independent
    output_filters = DEFAULT_FILTERS.copy()
    output_formats = DEFAULT_FORMATS.copy()

    valid_scopes = ["batch", "session", "daily", "historical"]
    value_options = ("--column", "--limit", "--dedup_scope", "--filters", "--formats")

    pos = 0
    while pos < len(args):
        option = args[pos]
        if option not in value_options:
            print(f"⚠️ Ignoring unrecognized argument: {option}")
            pos += 1
            continue
        if pos + 1 >= len(args):
            print(f"⚠️ Option {option} requires a value, ignoring it")
            pos += 1
            continue

        # Every supported option consumes exactly one following token.
        value = args[pos + 1]
        pos += 2

        if option == "--column":
            column = value
        elif option == "--limit":
            try:
                parsed_limit = int(value)
            except ValueError:
                parsed_limit = None
            # A row limit only makes sense as a positive integer.
            if parsed_limit is not None and parsed_limit > 0:
                limit = parsed_limit
            else:
                print(f"⚠️ Invalid limit value: {value}, using default (all)")
                limit = None
        elif option == "--dedup_scope":
            scope_val = value.lower()
            if scope_val in valid_scopes:
                dedup_scope = scope_val
            else:
                print(f"⚠️ Invalid dedup_scope: {value}")
                print(f"   Valid options: {', '.join(valid_scopes)}")
                print("   Using default: batch")
        elif option == "--filters":
            requested = [name.strip().lower() for name in value.split(",")]
            valid_filters = [name for name in requested if name in AVAILABLE_FILTERS]
            invalid_filters = [name for name in requested if name not in AVAILABLE_FILTERS]
            if invalid_filters:
                print(f"⚠️ Invalid filters: {', '.join(invalid_filters)}")
                print(f"   Valid options: {', '.join(AVAILABLE_FILTERS)}")
            # Keep the current selection when nothing valid was supplied.
            if valid_filters:
                output_filters = valid_filters
            print(f"   📋 Filters enabled: {', '.join(output_filters)}")
        else:  # "--formats"
            requested = [name.strip().lower() for name in value.split(",")]
            valid_fmts = [name for name in requested if name in AVAILABLE_FORMATS]
            invalid_fmts = [name for name in requested if name not in AVAILABLE_FORMATS]
            if invalid_fmts:
                print(f"⚠️ Invalid formats: {', '.join(invalid_fmts)}")
                print(f"   Valid options: {', '.join(AVAILABLE_FORMATS)}")
            # Keep the current selection when nothing valid was supplied.
            if valid_fmts:
                output_formats = valid_fmts
            print(f"   📦 Formats enabled: {', '.join(output_formats)}")

    return column, limit, dedup_scope, output_filters, output_formats


def _print_generated_files(reports):
    """Print the tree of report files described by the ``reports`` mapping.

    Only keys that are present and truthy are listed. The ``"filtered"``
    entry is expected to map a filter name to a dict keyed by format;
    non-dict entries are skipped.
    """
    print("\n📂 FILES GENERATED:")
    print("─" * 50)

    # Core Reports
    print("\n   📋 CORE REPORTS (All Combined):")
    if reports.get("json"):
        print(f"   ├─ 📊 {reports['json']}")
        print("   │     └─ JSON: Enterprise wrapper with audit trail")
    if reports.get("csv_detail"):
        print(f"   ├─ 📋 {reports['csv_detail']}")
        print("   │     └─ CSV: Per-query details (all 3 products)")
    if reports.get("csv_summary"):
        print(f"   ├─ 📈 {reports['csv_summary']}")
        print("   │     └─ CSV: Executive summary statistics")
    if reports.get("html"):
        print(f"   └─ 🌐 {reports['html']}")
        print("         └─ HTML: Visual dashboard")

    # Enterprise Reports
    print("\n   🏢 ENTERPRISE REPORTS:")
    if reports.get("excel"):
        print(f"   ├─ 📗 {reports['excel']}")
        print("   │     └─ XLSX: Multi-sheet workbook (Power BI ready)")
    if reports.get("parquet"):
        print(f"   ├─ 📦 {reports['parquet']}")
        print("   │     └─ PARQUET: Columnar format (5-10x smaller)")
    if reports.get("sql"):
        print(f"   ├─ 🗄️ {reports['sql']}")
        print("   │     └─ SQL: Ready-to-execute INSERT statements")
    if reports.get("powerbi"):
        print(f"   └─ 📊 {reports['powerbi']}")
        print("         └─ JSON: Power BI dashboard configuration")

    # Filtered Reports
    filtered = reports.get("filtered", {})
    if not filtered:
        return

    print("\n   📁 FILTERED REPORTS (Separated by Category):")
    filter_icons = {
        "masters": "✅",
        "duplicates": "🔗",
        "ban": "🚫",
        "warning": "⚠️",
        "safe": "💚",
        "cache_hits": "⚡",
        "cache_misses": "💾",
    }
    filter_names = {
        "masters": "MASTERS (Unique/Original)",
        "duplicates": "DUPLICATES (Merge Candidates)",
        "ban": "BAN (High Toxicity)",
        "warning": "WARNING (Medium Toxicity)",
        "safe": "SAFE (Clean Content)",
        "cache_hits": "CACHE HITS (Use Cached)",
        "cache_misses": "CACHE MISSES (Store New)",
    }

    last_index = len(filtered) - 1
    for idx, (filter_name, files) in enumerate(filtered.items()):
        if not isinstance(files, dict):
            continue
        icon = filter_icons.get(filter_name, "📄")
        name = filter_names.get(filter_name, filter_name.upper())
        is_last = idx == last_index
        prefix = "   └─" if is_last else "   ├─"
        formats_str = ", ".join(f".{fmt}" for fmt in files)
        print(f"{prefix} {icon} {name}")
        print(f"   {'   ' if is_last else '│  '}    └─ Formats: {formats_str}")


def _print_reference_guide():
    """Print the static format guide, filter legend and next-step hints."""
    print("\n" + "─" * 50)
    print("📝 FORMAT GUIDE:")
    print("   ├─ JSON    → API integration, versioning, audit trail")
    print("   ├─ CSV     → Quick analysis, Excel, Google Sheets")
    print("   ├─ XLSX    → Power BI, Excel with filters & charts")
    print("   ├─ PARQUET → Big data: Spark, BigQuery, Snowflake")
    print("   ├─ SQL     → Direct database import (PostgreSQL, MySQL)")
    print("   └─ HTML    → Share with non-technical stakeholders")

    print("\n📁 FILTER TYPES:")
    print("   ├─ MASTERS    → Only unique/original queries (keep as-is)")
    print("   ├─ DUPLICATES → Only duplicate queries (merge candidates)")
    print("   ├─ BAN        → High toxicity (immediate action required)")
    print("   ├─ WARNING    → Medium toxicity (needs review)")
    print("   ├─ SAFE       → Clean content (no issues)")
    print("   └─ CACHE      → API cache efficiency (hits vs misses)")

    print("\n📖 COLUMN LEGEND:")
    print("   PRODUCT 1 = Semantic Deduplication")
    print("   PRODUCT 2 = Semantic Cache")
    print("   PRODUCT 3 = User Behavior Detection")

    print("\n🚀 NEXT STEPS:")
    print("   1. Open Excel file in Power BI for instant dashboard")
    print("   2. Import Parquet to BigQuery/Snowflake for analytics")
    print("   3. Run SQL file to populate your database")
    print("   4. Use filtered files for targeted actions:")
    print("      - BAN file → Auto-ban pipeline")
    print("      - DUPLICATES file → Data cleanup")
    print("      - CACHE HITS file → API cost analysis")
    print("   5. Questions? Contact hiwosy.com")
    print()


def main():
    """Run the command-line workflow: parse arguments, call the API, report.

    Steps, in order: print the banner, parse ``sys.argv``, verify the input
    path, run ``test_api_health()`` and ``test_api_key()``, load the queries
    with ``load_csv``, send them through ``process_batch``, write the reports
    with ``generate_report`` and print a summary of the generated files.

    Exits via ``sys.exit``:
        0 when ``--help`` is requested.
        1 when no file argument is given, the path is missing or is a
        directory, either API check fails, the file cannot be loaded, it
        yields no queries, or the API returns no results.
    """
    print("═" * 60)
    print("🧠 HIWOSY™ SEMANTIC DEDUPLICATION TEST")
    print("   Powered by hiwosy.com")
    print("═" * 60)

    # Parse arguments
    if len(sys.argv) < 2 or "--help" in sys.argv:
        print_usage()
        sys.exit(0 if "--help" in sys.argv else 1)

    filepath = sys.argv[1]
    column, limit, dedup_scope, output_filters, output_formats = _parse_cli_options(
        sys.argv[2:]
    )

    # Validate the path before any network round-trips are made.
    import os
    if not os.path.exists(filepath):
        print(f"\n❌ File not found: {filepath}")
        sys.exit(1)
    if os.path.isdir(filepath):
        print(f"\n❌ Path is a directory, not a file: {filepath}")
        sys.exit(1)

    # Run connection tests
    if not test_api_health():
        print("\n❌ Cannot reach Hiwosy API.")
        print("   Check your internet connection and try again.")
        sys.exit(1)

    if not test_api_key():
        print("\n❌ API key validation failed.")
        print("   1. Open this script in a text editor")
        # No line number here: a hard-coded one goes stale as the file changes.
        print("   2. Find the line: API_KEY = \"YOUR_API_KEY_HERE\"")
        print("   3. Replace with your actual API key")
        print("   4. Save and run again")
        sys.exit(1)

    # Load data
    print(f"\n📂 Loading file: {filepath}")
    if limit:
        print(f"   (Limited to first {limit} rows)")

    try:
        queries = load_csv(filepath, column, limit)
    except Exception as e:
        # Top-level boundary: report any loader failure and stop.
        print(f"❌ Failed to load file: {e}")
        sys.exit(1)

    print(f"   ✅ Loaded {len(queries):,} queries")

    if not queries:
        print("❌ No valid queries found in file!")
        print("   Check that your file has text content")
        sys.exit(1)

    # Show sample
    print("\n📝 Sample queries (first 3):")
    for number, sample in enumerate(queries[:3], start=1):
        print(f"   {number}. \"{sample[:60]}{'...' if len(sample) > 60 else ''}\"")

    # Confirm before processing
    print(f"\n⏳ Ready to process {len(queries):,} queries...")
    print(f"   📋 Dedup scope: {dedup_scope.upper()}")
    scope_notes = {
        "batch": "   ℹ️  (Each run independent - duplicates only within this file)",
        "session": "   ℹ️  (Comparing against queries from last 1 hour)",
        "daily": "   ℹ️  (Comparing against queries from last 24 hours)",
        "historical": "   ℹ️  (Comparing against ALL historical data)",
    }
    scope_note = scope_notes.get(dedup_scope)
    if scope_note is not None:
        print(scope_note)

    # Process
    results = process_batch(queries, dedup_scope=dedup_scope)

    if not results:
        print("❌ No results received from API")
        sys.exit(1)

    # Generate comprehensive production reports (all enterprise formats)
    reports = generate_report(
        queries=queries,
        results=results,
        filepath=filepath,
        output_filters=output_filters,
        output_formats=output_formats
    )

    print("\n" + "═" * 70)
    print("✅ HIWOSY™ ANALYSIS COMPLETE - ENTERPRISE EDITION")
    print("═" * 70)

    _print_generated_files(reports)
    _print_reference_guide()


# Run the command-line workflow only when this file is executed as a script;
# importing it as a module defines the functions without calling main().
if __name__ == "__main__":
    main()

# ================================================================================
# HIWOSY™ MEMORY LAYER  V R C R
# ================================================================================
# Self-Learning Semantic Deduplication Technology
#
# Patent Pending: USPTO 63/911,048, 63/915,918, 63/921,878
# Copyright Cases: 1-15035049381, 1-15034439898
# Contact: Ljubisa Kovacevic | kovalubo@gmail.com
# ================================================================================

