#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Tests unitaires + integration pour l'algorithme de discrimination v4.9e.
"""

import sys
import os
import unittest

# Ajouter agents_python au path
sys.path.insert(0, os.path.join(os.path.dirname(__file__),
                                "agents_python"))

from media_fetcher import (
    _classify_url_type,
    _extract_activity_domains,
    _extract_identity_signals,
    _signals_compatible,
    _cluster_signals,
    _decide_discrimination,
    _strip_accents,
)


# ══════════════════════════════════════════════════════════════════════
# TESTS UNITAIRES
# ══════════════════════════════════════════════════════════════════════

class TestClassifyUrlType(unittest.TestCase):
    """Checks the category label _classify_url_type returns for sample URLs."""

    def _expect_type(self, expected, *urls):
        """Assert, in order, that each URL in ``urls`` maps to ``expected``."""
        for url in urls:
            self.assertEqual(_classify_url_type(url), expected)

    def test_linkedin(self):
        """LinkedIn profiles on the www and fr subdomains -> "linkedin"."""
        self._expect_type(
            "linkedin",
            "https://www.linkedin.com/in/pascal-herard",
            "https://fr.linkedin.com/in/someone",
        )

    def test_social(self):
        """GitHub, Muck Rack and Babelio pages -> "social"."""
        self._expect_type(
            "social",
            "https://github.com/someone",
            "https://www.muckrack.com/pascal-herard",
            "https://babelio.com/auteur/someone",
        )

    def test_noise(self):
        """Wikipedia, societe.com and pappers.fr pages -> "noise"."""
        self._expect_type(
            "noise",
            "https://fr.wikipedia.org/wiki/Someone",
            "https://www.societe.com/societe/foo",
            "https://pappers.fr/entreprise/foo",
        )

    def test_press(self):
        """Le Monde and Mediapart articles -> "press"."""
        self._expect_type(
            "press",
            "https://www.lemonde.fr/article/foo",
            "https://www.mediapart.fr/journal/foo",
        )

    def test_other(self):
        """A URL matching none of the categories above -> "other"."""
        self._expect_type(
            "other",
            "https://www.random-site.com/page",
        )


class TestExtractActivityDomains(unittest.TestCase):
    """Checks the domain labels _extract_activity_domains finds in a text."""

    def test_sante(self):
        """A sentence mentioning a psychiatrist yields "sante"."""
        text = "Le docteur est un psychiatre renomme"
        self.assertIn("sante", _extract_activity_domains(text))

    def test_journalisme(self):
        """A sentence mentioning a journalist yields "journalisme"."""
        text = "Pascal Herard est journaliste a TV5 Monde"
        self.assertIn("journalisme", _extract_activity_domains(text))

    def test_multiple_domains(self):
        """One sentence can yield several domains at once."""
        text = "Le professeur et chercheur est aussi ecrivain"
        domains = _extract_activity_domains(text)
        for label in ("recherche", "culture"):
            self.assertIn(label, domains)

    def test_none(self):
        """A sentence without any activity keyword yields nothing."""
        text = "Le chat est sur la table"
        domains = _extract_activity_domains(text)
        self.assertEqual(len(domains), 0)

    def test_accents_stripped(self):
        """Accented input is matched just like its unaccented form."""
        text = "Il est médecin et professeur à l'université"
        domains = _extract_activity_domains(text)
        for label in ("sante", "recherche"):
            self.assertIn(label, domains)


class TestExtractIdentitySignals(unittest.TestCase):
    """Checks the signal dict _extract_identity_signals builds from one hit."""

    @staticmethod
    def _signal_for(url, title, snippet,
                    tokens=("pascal", "herard"),
                    full_name="pascal herard"):
        """Build a search-hit dict and run the extractor on it.

        The name tokens are handed over as a list; the defaults target
        "pascal herard".
        """
        hit = {"url": url, "title": title, "snippet": snippet}
        return _extract_identity_signals(hit, list(tokens), full_name)

    def test_basic_signal(self):
        """A press article naming the person yields a populated signal."""
        sig = self._signal_for(
            "https://lemonde.fr/article",
            "Pascal Herard journaliste",
            "Pascal Herard travaille a TV5",
        )
        self.assertIsNotNone(sig)
        self.assertEqual(sig["url_type"], "press")
        self.assertTrue(sig["name_in_title"])
        self.assertIn("journalisme", sig["activity_domains"])

    def test_noise_filtered(self):
        """A Wikipedia hit produces no signal at all."""
        sig = self._signal_for(
            "https://fr.wikipedia.org/wiki/Someone",
            "Someone",
            "Someone is someone",
            tokens=("some", "one"),
            full_name="some one",
        )
        self.assertIsNone(sig)

    def test_name_absent(self):
        """A hit that never mentions the name produces no signal."""
        sig = self._signal_for(
            "https://lemonde.fr/article",
            "Article random",
            "Rien a voir",
        )
        self.assertIsNone(sig)

    def test_linkedin_profession(self):
        """A LinkedIn hit exposes the profession found in the snippet."""
        sig = self._signal_for(
            "https://fr.linkedin.com/in/pascal-herard",
            "Pascal Herard - TV5 Monde",
            "Journaliste - Paris, France",
        )
        self.assertIsNotNone(sig)
        self.assertEqual(sig["url_type"], "linkedin")
        self.assertEqual(sig["profession"], "Journaliste")

    def test_name_in_domain(self):
        """A host name containing the person's name sets name_in_domain."""
        sig = self._signal_for(
            "https://pascalherard.fr/bio",
            "Pascal Herard - Bio",
            "Site officiel de Pascal Herard",
        )
        self.assertIsNotNone(sig)
        self.assertTrue(sig["name_in_domain"])


class TestSignalsCompatible(unittest.TestCase):
    """Checks the pairwise verdict returned by _signals_compatible."""

    @staticmethod
    def _sig(url_type, domains=(), in_title=False, in_snippet=False,
             in_domain=False):
        """Return a signal dict; a fresh set is created on every call."""
        return {
            "activity_domains": set(domains),
            "url_type": url_type,
            "name_in_title": in_title,
            "name_in_snippet": in_snippet,
            "name_in_domain": in_domain,
        }

    def test_same_activity(self):
        """Two signals sharing an activity domain are compatible."""
        first = self._sig("press", ["journalisme"],
                          in_title=True, in_snippet=True)
        second = self._sig("other", ["journalisme"], in_title=True)
        self.assertTrue(_signals_compatible(first, second))

    def test_linkedin_strong(self):
        """A LinkedIn signal pairs with a press one without any domain."""
        first = self._sig("linkedin", in_title=True, in_snippet=True)
        second = self._sig("press", in_title=True)
        self.assertTrue(_signals_compatible(first, second))

    def test_no_overlap(self):
        """Disjoint domains and no name evidence -> incompatible."""
        first = self._sig("other", ["sport"])
        second = self._sig("other", ["sante"])
        self.assertFalse(_signals_compatible(first, second))

    def test_personal_site(self):
        """A name-in-domain signal pairs with a press one without domain."""
        first = self._sig("other", in_title=True, in_snippet=True,
                          in_domain=True)
        second = self._sig("press", in_title=True)
        self.assertTrue(_signals_compatible(first, second))


class TestClusterSignals(unittest.TestCase):
    """Checks how _cluster_signals groups a list of signal dicts."""

    @staticmethod
    def _sig(domain, url_type, in_title=False, in_snippet=False):
        """Return a single-domain signal dict with name_in_domain False."""
        return {
            "activity_domains": {domain},
            "url_type": url_type,
            "name_in_title": in_title,
            "name_in_snippet": in_snippet,
            "name_in_domain": False,
        }

    def test_empty(self):
        """No signals in, no clusters out."""
        self.assertEqual(_cluster_signals([]), [])

    def test_single(self):
        """One signal forms one cluster of size one."""
        groups = _cluster_signals([
            self._sig("sante", "press", in_title=True, in_snippet=True),
        ])
        self.assertEqual(len(groups), 1)
        self.assertEqual(len(groups[0]), 1)

    def test_two_compatible(self):
        """Two signals with the same domain end up in the same cluster."""
        groups = _cluster_signals([
            self._sig("journalisme", "press",
                      in_title=True, in_snippet=True),
            self._sig("journalisme", "other", in_title=True),
        ])
        self.assertEqual(len(groups), 1)
        self.assertEqual(len(groups[0]), 2)

    def test_two_incompatible(self):
        """Two signals with disjoint domains stay in separate clusters."""
        groups = _cluster_signals([
            self._sig("sport", "other"),
            self._sig("sante", "other"),
        ])
        self.assertEqual(len(groups), 2)

    def test_sorted_by_size(self):
        """The largest cluster comes first in the returned list."""
        groups = _cluster_signals([
            self._sig("journalisme", "press",
                      in_title=True, in_snippet=True),
            self._sig("sport", "other"),
            self._sig("journalisme", "other", in_title=True),
            self._sig("journalisme", "social",
                      in_title=True, in_snippet=True),
        ])
        # 3 "journalisme" signals + 1 "sport" signal -> 2 clusters.
        self.assertEqual(len(groups), 2)
        self.assertEqual(len(groups[0]), 3)  # dominant cluster
        self.assertEqual(len(groups[1]), 1)


class TestDecideDiscrimination(unittest.TestCase):
    """Checks the verdict dict produced by _decide_discrimination."""

    @staticmethod
    def _member(url_type, domains=(), profession=None, in_title=True):
        """Return a cluster member; the name is always in the snippet."""
        return {
            "url_type": url_type,
            "activity_domains": set(domains),
            "profession": profession,
            "name_in_title": in_title,
            "name_in_snippet": True,
            "name_in_domain": False,
        }

    def test_empty(self):
        """No cluster at all -> non_discriminable with low confidence."""
        verdict = _decide_discrimination([], "Test Person")
        self.assertEqual(verdict["status"], "non_discriminable")
        self.assertEqual(verdict["confidence"], "low")

    def test_identifiable_large_cluster(self):
        """Four consistent "tech" signals incl. LinkedIn -> identifiable."""
        members = [
            self._member("linkedin", ["tech"], profession="CTO"),
            self._member("press", ["tech"]),
            self._member("social", ["tech"]),
            self._member("other", ["tech"]),
        ]
        verdict = _decide_discrimination([members], "Test Person")
        self.assertEqual(verdict["status"], "identifiable")
        self.assertEqual(verdict["confidence"], "high")
        self.assertEqual(verdict["profession"], "CTO")
        self.assertEqual(verdict["activity_domain"], "tech")

    def test_partial_small_cluster(self):
        """Two consistent "sante" signals -> partial with low confidence."""
        members = [
            self._member("press", ["sante"]),
            self._member("other", ["sante"]),
        ]
        verdict = _decide_discrimination([members], "Test Person")
        self.assertEqual(verdict["status"], "partial")
        self.assertEqual(verdict["confidence"], "low")

    def test_non_discriminable_single(self):
        """A lone signal without any domain -> non_discriminable."""
        members = [self._member("other", in_title=False)]
        verdict = _decide_discrimination([members], "Test Person")
        self.assertEqual(verdict["status"], "non_discriminable")


# ══════════════════════════════════════════════════════════════════════
# TESTS INTEGRATION (appels Serper reels)
# ══════════════════════════════════════════════════════════════════════

class TestDiscriminatePersonIntegration(unittest.TestCase):
    """Integration tests that issue real Serper requests.

    Covers the 7 mandatory cases of the v4.9e prompt. The whole class is
    skipped when SERPER_API_KEY is not available in the environment.
    """

    @classmethod
    def setUpClass(cls):
        """Skip the class without SERPER_API_KEY, else build the fetcher."""
        # python-dotenv only serves to load a local .env file. When it is
        # not installed, rely on the process environment instead of letting
        # an ImportError turn every test of the class into an error.
        try:
            from dotenv import load_dotenv
        except ImportError:
            pass
        else:
            load_dotenv()
        if not os.getenv("SERPER_API_KEY", ""):
            raise unittest.SkipTest(
                "SERPER_API_KEY manquante")
        # Imported here so the fetcher class is only needed once the
        # API key check has passed.
        from media_fetcher import MediaFetcher
        cls.fetcher = MediaFetcher()

    def _run_discrimination(self, query):
        """Run discriminate_person on ``query``, print a report, return it.

        The report reads result["identity"] and result["notoriete"]; the
        unmodified result is returned for the caller's assertions.
        """
        result = self.fetcher.discriminate_person(query)
        identity = result["identity"]
        notoriete = result["notoriete"]
        meta = notoriete.get("meta", {})
        print(f"\n{'='*60}")
        print(f"QUERY: {query}")
        print(f"STATUS: {meta.get('discrimination_status')}")
        print(f"CONFIDENCE: {identity.get('confidence')}")
        print(f"PROFESSION: {identity.get('profession')}")
        print(f"ACTIVITY: {identity.get('activity_domain')}")
        print(f"CLUSTERS: {meta.get('clusters_count')}")
        print(f"DOMINANT: {meta.get('dominant_cluster_size')}")
        print(f"SOURCES: {len(notoriete.get('sources', []))}")
        print(f"PORTRAIT: {'OUI' if notoriete.get('portrait') else 'NON'}")
        print(f"CREDITS: {meta.get('serper_credits')}")
        print(f"ELAPSED: {meta.get('elapsed_ms')}ms")
        for s in notoriete.get("sources", []):
            # "or ''" also covers a title key that is present but None,
            # which would otherwise make the slice raise a TypeError.
            print(f"  [{s.get('sub_type')}] "
                  f"{s.get('domain')} — {(s.get('title') or '')[:60]}")
        print(f"{'='*60}")
        return result

    def test_01_emmanuel_macron(self):
        """Very well-known person: identifiable, confidence high or medium."""
        r = self._run_discrimination("Emmanuel Macron")
        identity = r["identity"]
        meta = r["notoriete"]["meta"]
        self.assertEqual(
            meta["discrimination_status"], "identifiable")
        self.assertIn(
            identity["confidence"], ("high", "medium"))

    def test_02_pascal_herard(self):
        """TV5 journalist: identifiable or partial, with >= 1 source."""
        r = self._run_discrimination("Pascal Hérard")
        meta = r["notoriete"]["meta"]
        self.assertIn(
            meta["discrimination_status"],
            ("identifiable", "partial"))
        # At least one source must be returned.
        self.assertGreater(
            len(r["notoriete"]["sources"]), 0)

    def test_03_marie_leboiteux(self):
        """France 24 journalist: identifiable or partial."""
        r = self._run_discrimination("Marie Le Boiteux")
        meta = r["notoriete"]["meta"]
        self.assertIn(
            meta["discrimination_status"],
            ("identifiable", "partial"))

    def test_04_khelil_ben_osman(self):
        """Tech entrepreneur: identifiable or partial, no registry URLs."""
        r = self._run_discrimination("Khelil Ben Osman")
        meta = r["notoriete"]["meta"]
        self.assertIn(
            meta["discrimination_status"],
            ("identifiable", "partial"))
        # No commercial-registry page may appear among the sources.
        for s in r["notoriete"]["sources"]:
            self.assertNotIn("societe.com", s.get("url", ""))
            self.assertNotIn("pappers.fr", s.get("url", ""))

    def test_05_gil_charpenet(self):
        """Little-known person: any outcome is accepted."""
        r = self._run_discrimination("Gil Charpenet")
        # Every status is accepted: the point is that the algorithm
        # runs to completion and reports a known confidence level.
        identity = r["identity"]
        self.assertIn(
            identity["confidence"],
            ("low", "medium", "high"))

    def test_06_jean_martin(self):
        """Common name with homonyms: any of the three statuses passes."""
        r = self._run_discrimination("Jean Martin")
        meta = r["notoriete"]["meta"]
        # With such a common name the discrimination should detect the
        # ambiguity; the assertion itself only requires a known status.
        self.assertIn(
            meta["discrimination_status"],
            ("non_discriminable", "partial", "identifiable"))

    def test_07_prenom_invente(self):
        """Made-up name: non_discriminable with no source."""
        r = self._run_discrimination(
            "Zylbertox Monfranquibet")
        meta = r["notoriete"]["meta"]
        self.assertEqual(
            meta["discrimination_status"],
            "non_discriminable")
        self.assertEqual(
            len(r["notoriete"]["sources"]), 0)


# Script entry point: run every test in this module with verbose output.
if __name__ == "__main__":
    unittest.main(verbosity=2)
