#!/usr/bin/env python3

import ftplib
import os
import re
import unicodedata
from collections import defaultdict
from pathlib import Path

# ============================================================
# CONFIG
# ============================================================

# Connection settings. Each one can be overridden with the environment
# variable named below, so credentials do not have to live in this file; the
# literals are fallbacks that keep the script working unchanged when the
# variables are not set.
# NOTE(review): a password committed to source should be rotated, and the
# literal fallbacks removed once the environment variables are in place.
FTP_HOST = os.environ.get("DJDOWNLOAD_FTP_HOST", "ftp.djdownload.me")
FTP_USER = os.environ.get("DJDOWNLOAD_FTP_USER", "ludovic.m.santos")
FTP_PASS = os.environ.get("DJDOWNLOAD_FTP_PASS", "3af32ssH")
FTP_PORT = int(os.environ.get("DJDOWNLOAD_FTP_PORT", "21"))

# Directory the scan changes into right after login.
FTP_BASE = "/home/ftp.djdownload.me"

# Directories directly under FTP_BASE that are scanned, in this order.
YEARS = [
    "2024-DJdownload",
    "2025-DJdownload",
    "2026-DJdownload",
]

# Local root of the shortlist; each sub-directory is treated as one genre.
SHORTLIST_DIR = Path("/volume1/dj-ai/shortlist")

# File extensions treated as audio (compared case-insensitively). Kept as a
# tuple because it is passed to str.endswith().
AUDIO_EXT = (".mp3", ".wav")

# Minimum token_similarity() score for an FTP file to count as a match
# (scores are computed on normalized names, i.e. after ignore-word removal).
MATCH_THRESHOLD = 0.92

# Timeout passed to ftplib.FTP, in seconds.
FTP_TIMEOUT = 25
# A progress line is printed every time this many more files were scanned.
PROGRESS_EVERY = 50

# ============================================================
# GENRE ALIASES
# ============================================================

# Maps a local shortlist genre folder name to the FTP genre folder names that
# are tried for it. A genre without an entry here is looked up on the FTP
# under its own name only.
GENRE_ALIASES = {
    "House": ["House", "House & Disco", "House / Deep", "House (Classic)"],
}

# ============================================================
# WORD FILTERS (THE IMPORTANT PART)
# ============================================================

# Version / mix descriptors. Any token equal to one of these is dropped from
# a name during normalization, so they never influence whether two names are
# considered the same track.
IGNORE_WORDS = set(
    "mix original extended edit club radio dub instrumental version remix".split()
)

# ============================================================
# HELPERS
# ============================================================

# Patterns used by normalize(), compiled once at import time.
_ROUND_BRACKET_RE = re.compile(r"\(.*?\)")
_SQUARE_BRACKET_RE = re.compile(r"\[.*?\]")
_NON_ALNUM_RE = re.compile(r"[^a-z0-9]+")


def normalize(name: str) -> str:
    """Reduce a track name to a space-separated string of comparison tokens.

    Steps, in order:

    1. Fold accents: decompose the text (NFKD) and drop combining marks, so
       that "é" and "e" + combining accent both become "e".
    2. Lowercase.
    3. If the name contains " - ", keep only what follows the first one
       (the part before it is taken to be the artist).
    4. Remove "(...)" and "[...]" groups together with their content.
    5. Replace every run of characters outside a-z / 0-9 with one space.
    6. Drop the tokens listed in ``IGNORE_WORDS``.

    Args:
        name: Track name, typically a file name without its extension.

    Returns:
        The remaining tokens joined by single spaces; ``""`` if none remain.
    """
    # Without this step the ASCII-only filter below treats the two Unicode
    # spellings of an accented letter differently (precomposed "é" is removed
    # entirely, decomposed "e" + mark keeps the "e"), so the same title coming
    # from two systems could normalize to different tokens.
    decomposed = unicodedata.normalize("NFKD", name)
    name = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    name = name.lower()

    # Remove artist if present
    if " - " in name:
        name = name.split(" - ", 1)[1]

    # Remove brackets
    name = _ROUND_BRACKET_RE.sub("", name)
    name = _SQUARE_BRACKET_RE.sub("", name)

    # Normalize separators
    name = _NON_ALNUM_RE.sub(" ", name)

    return " ".join(t for t in name.split() if t not in IGNORE_WORDS)

def token_similarity(a: str, b: str) -> float:
    """Return the share of *a*'s distinct tokens that also occur in *b*.

    Both strings are split on whitespace and compared as sets, so repeated
    tokens count once. The measure is not symmetric: the divisor is the
    number of distinct tokens in *a* only.

    Returns:
        A value between 0.0 and 1.0; 0.0 when either string has no tokens.
    """
    wanted = set(a.split())
    available = set(b.split())

    if wanted and available:
        shared = wanted.intersection(available)
        return len(shared) / len(wanted)

    return 0.0

# ============================================================
# LOAD SHORTLIST (GROUPED BY GENRE)
# ============================================================

def load_shortlist():
    """Collect the local shortlist audio files, grouped by genre folder.

    Every sub-directory of ``SHORTLIST_DIR`` is treated as a genre. Entries
    inside it whose suffix (compared in lowercase) is listed in ``AUDIO_EXT``
    are recorded; everything else is skipped.

    Returns:
        A ``defaultdict(list)`` mapping the genre folder name to a list of
        dicts with the keys ``"file"`` (the file name) and ``"norm"`` (the
        result of ``normalize`` on the name without extension). A genre
        folder without audio files gets no key.
    """
    grouped = defaultdict(list)

    for entry in SHORTLIST_DIR.iterdir():
        if entry.is_dir():
            tracks = [
                {"file": item.name, "norm": normalize(item.stem)}
                for item in entry.iterdir()
                if item.suffix.lower() in AUDIO_EXT
            ]
            # Only touch the mapping when there is something to store, so no
            # empty genre keys are created.
            if tracks:
                grouped[entry.name].extend(tracks)

    return grouped

# ============================================================
# MAIN SCAN (DEDUPLICATED)
# ============================================================

def _enter_dir(ftp, name):
    """Change into *name* on the server; return False if the server refuses."""
    try:
        ftp.cwd(name)
    except ftplib.error_perm:
        # 5xx reply: the entry is not a directory, is missing or is not
        # accessible. Network failures are deliberately not caught here.
        return False
    return True


def _list_dir(ftp):
    """Return the names in the current FTP directory, or [] if unlistable."""
    try:
        names = ftp.nlst()
    except ftplib.error_perm:
        # Some servers answer NLST on an empty directory with a 5xx reply
        # instead of an empty listing; treat that as "nothing here".
        return []
    # Entering "." or ".." would break the cwd / cwd("..") pairing below.
    return [n for n in names if n not in (".", "..")]


def _first_match(shortlist_tracks, ftp_norm):
    """Return the first shortlist entry reaching MATCH_THRESHOLD, else None."""
    for track in shortlist_tracks:
        if token_similarity(track["norm"], ftp_norm) >= MATCH_THRESHOLD:
            return track
    return None


def build_report():
    """Scan the FTP server for shortlist tracks and print a match report.

    The local shortlist is loaded with ``load_shortlist``. The function then
    logs in to the FTP server, changes into ``FTP_BASE`` and walks
    ``<year>/<month>/<day>/<genre>`` for every entry of ``YEARS``, every
    listed month and day directory, and every FTP genre folder configured for
    a local genre (``GENRE_ALIASES``, falling back to the local genre name).
    Each audio file name found there is normalized and compared against the
    shortlist entries of that local genre; the first entry whose
    ``token_similarity`` reaches ``MATCH_THRESHOLD`` is recorded.

    A matched FTP file is identified by (year, FTP genre folder, file name)
    and is neither scanned nor counted a second time. Only directory changes
    and name listings are issued; no file is transferred.

    Entries that cannot be entered (5xx reply to CWD) are skipped. Any other
    failure (timeout, dropped connection, login error) propagates to the
    caller; the connection is closed in every case.

    Returns:
        None. All results are printed to standard output.
    """
    shortlist_by_genre = load_shortlist()
    total_shortlist = sum(len(v) for v in shortlist_by_genre.values())

    print("\n====================================")
    print("🎧 DJDOWNLOAD — CONFIRMATION REPORT")
    print("====================================\n")
    print(f"Total shortlist tracks: {total_shortlist}\n")

    seen_ftp_tracks = set()  # (year, ftp_genre, file name) of matched files
    summary = defaultdict(lambda: defaultdict(int))  # year -> genre -> count
    matches = []  # (shortlist file, year, local genre, ftp genre)
    scanned_files = 0

    # The context manager sends QUIT / closes the socket even when the scan
    # aborts with an exception.
    with ftplib.FTP(timeout=FTP_TIMEOUT) as ftp:
        ftp.connect(FTP_HOST, FTP_PORT)
        ftp.login(FTP_USER, FTP_PASS)
        ftp.set_pasv(True)
        ftp.cwd(FTP_BASE)

        for year in YEARS:
            if not _enter_dir(ftp, year):
                continue
            print(f"\n📅 Year: {year}")

            for month in _list_dir(ftp):
                if not _enter_dir(ftp, month):
                    continue
                print(f"  📂 Month: {month}")

                for day in _list_dir(ftp):
                    if not _enter_dir(ftp, day):
                        continue

                    for local_genre, shortlist_tracks in shortlist_by_genre.items():
                        ftp_genres = GENRE_ALIASES.get(local_genre, [local_genre])

                        for ftp_genre in ftp_genres:
                            if not _enter_dir(ftp, ftp_genre):
                                continue

                            for f in _list_dir(ftp):
                                if not f.lower().endswith(AUDIO_EXT):
                                    continue

                                ftp_id = (year, ftp_genre, f)
                                if ftp_id in seen_ftp_tracks:
                                    continue

                                scanned_files += 1
                                ftp_norm = normalize(os.path.splitext(f)[0])

                                hit = _first_match(shortlist_tracks, ftp_norm)
                                if hit is not None:
                                    seen_ftp_tracks.add(ftp_id)
                                    summary[year][local_genre] += 1
                                    matches.append(
                                        (hit["file"], year, local_genre, ftp_genre)
                                    )

                                if scanned_files % PROGRESS_EVERY == 0:
                                    print(
                                        f"    ⏳ scanned {scanned_files} files | "
                                        f"unique matches {len(seen_ftp_tracks)}",
                                        flush=True
                                    )

                            ftp.cwd("..")  # leave the genre folder

                    ftp.cwd("..")  # leave the day folder
                ftp.cwd("..")  # leave the month folder
            ftp.cwd("..")  # leave the year folder

    # ========================================================
    # REPORT
    # ========================================================

    print("\n====================================")
    print(f"🎯 UNIQUE MATCHES FOUND: {len(seen_ftp_tracks)}")
    print("====================================\n")

    for year, genres in summary.items():
        print(year)
        for genre, count in sorted(genres.items()):
            print(f"  {genre:<25} {count}")
        print("")

    print("------------------------------------")
    print("Example matches:")
    for m in matches[:10]:
        print(f" - {m[0]}  →  {m[1]} / {m[3]}")

    print("\nNO FILES WERE DOWNLOADED\n")

# ============================================================
# ENTRY
# ============================================================

# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    build_report()