#!/usr/bin/env python3

import ftplib
import os
import re
import time
import unicodedata
from pathlib import Path
from collections import defaultdict

# ============================================================
# CONFIG
# ============================================================

# --- FTP connection -------------------------------------------------------
# Host, user and password can be overridden through the DJ_FTP_HOST,
# DJ_FTP_USER and DJ_FTP_PASS environment variables; the literals below are
# only fallbacks that keep the script working when nothing is exported.
# NOTE(review): a real-looking password is committed in this file. Rotate it
# and remove the fallback once the environment variables are deployed.
FTP_HOST = os.environ.get("DJ_FTP_HOST", "ftp.djdownload.me")
FTP_USER = os.environ.get("DJ_FTP_USER", "ludovic.m.santos")
FTP_PASS = os.environ.get("DJ_FTP_PASS", "3af32ssH")
FTP_PORT = 21

# Remote directory the crawl starts from; the YEARS entries are resolved
# relative to it.
FTP_BASE = "/home/ftp.djdownload.me"

# Top-level remote folders to crawl, in this order.
YEARS = [
    "2024-DJdownload",
    "2025-DJdownload",
    "2026-DJdownload",
]

# --- Local paths ----------------------------------------------------------
# SHORTLIST_DIR holds one sub-directory per genre containing the wanted
# tracks; DOWNLOAD_ROOT receives the fetched files.
SHORTLIST_DIR = Path("/volume1/dj-ai/app/shortlist")
DOWNLOAD_ROOT = Path("/volume1/dj-ai/app/downloads")

# Must stay a tuple: it is passed directly to str.endswith().
AUDIO_EXT = (".mp3", ".wav")

# --- Matching and pacing --------------------------------------------------
# Minimum token_similarity() score for a remote file to count as a match.
MATCH_THRESHOLD = 0.92
# Seconds to sleep after each successful download.
DOWNLOAD_DELAY = 5
# Timeout in seconds handed to ftplib.FTP.
FTP_TIMEOUT = 30

# Tokens that normalize() drops before names are compared.
IGNORE_WORDS = {
    "mix", "original", "extended", "edit", "club",
    "radio", "dub", "instrumental", "version", "remix"
}

# ============================================================
# HELPERS
# ============================================================

def normalize(text: str) -> str:
    """Reduce a track name to a lowercase, space-separated token string.

    Steps, in order:
      1. Decompose with NFKD and drop every non-ASCII code point, which
         strips accents and discards characters with no ASCII form.
      2. Lowercase.
      3. Remove ``(...)`` and ``[...]`` segments (non-greedy).
      4. Collapse every run of characters outside ``a-z0-9`` into one space.
      5. Drop tokens listed in ``IGNORE_WORDS``.

    Returns the remaining tokens joined by single spaces; the result may be
    an empty string.
    """
    ascii_only = (
        unicodedata.normalize("NFKD", text)
        .encode("ascii", "ignore")
        .decode("ascii")
    )

    cleaned = ascii_only.lower()
    # Order matters: bracketed segments must go before punctuation is
    # flattened, otherwise the brackets themselves would already be gone.
    for pattern, replacement in (
        (r"\(.*?\)", ""),
        (r"\[.*?\]", ""),
        (r"[^a-z0-9]+", " "),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)

    return " ".join(word for word in cleaned.split() if word not in IGNORE_WORDS)

def token_similarity(a: str, b: str) -> float:
    """Return the share of *a*'s distinct tokens that also occur in *b*.

    Both strings are split on whitespace and de-duplicated. The score is
    asymmetric: it is divided by the number of distinct tokens in *a* only,
    so extra tokens in *b* do not lower it. Returns 0.0 when either string
    has no tokens.
    """
    wanted = set(a.split())
    available = set(b.split())

    if wanted and available:
        return len(wanted.intersection(available)) / len(wanted)
    return 0.0

# ============================================================
# LOAD SHORTLIST
# ============================================================

def load_shortlist():
    """Read the wanted tracks from ``SHORTLIST_DIR``, grouped by genre.

    Every sub-directory of ``SHORTLIST_DIR`` is treated as a genre. Inside
    it, each regular file whose suffix (case-insensitive) is in
    ``AUDIO_EXT`` becomes one entry ``{"file": <file name>, "norm":
    normalize(<stem>)}``.

    Returns:
        A ``defaultdict(list)`` mapping genre directory name to its list of
        entries. Genres with no audio files are reported on stdout but left
        out of the mapping, so an all-empty shortlist is falsy for callers.

    Raises:
        RuntimeError: if ``SHORTLIST_DIR`` does not exist.
    """
    by_genre = defaultdict(list)

    if not SHORTLIST_DIR.exists():
        raise RuntimeError(f"Shortlist not found: {SHORTLIST_DIR}")

    # Sorted so the load order (and therefore which shortlist entry is tried
    # first during matching) does not depend on filesystem enumeration order.
    for genre_dir in sorted(SHORTLIST_DIR.iterdir()):
        if not genre_dir.is_dir():
            continue

        genre = genre_dir.name

        # is_file() keeps directories that merely have an audio-looking name
        # from being loaded as tracks.
        tracks = [
            {"file": f.name, "norm": normalize(f.stem)}
            for f in sorted(genre_dir.iterdir())
            if f.is_file() and f.suffix.lower() in AUDIO_EXT
        ]

        print(f"📁 Loaded {len(tracks)} tracks for genre: {genre}")

        # Only register genres that contain something; indexing the
        # defaultdict for an empty genre would create a phantom entry.
        if tracks:
            by_genre[genre] = tracks

    return by_genre

# ============================================================
# MAIN DOWNLOADER
# ============================================================

def _enter_dir(ftp: ftplib.FTP, name: str) -> bool:
    """Change into remote directory *name*; return False if that fails."""
    try:
        ftp.cwd(name)
    except ftplib.all_errors:
        # Best effort: the entry may be a plain file, may not exist, or the
        # server may have refused. The caller just skips it.
        return False
    return True


def _list_dir(ftp: ftplib.FTP) -> list:
    """Return the names in the current remote directory, or [] if refused."""
    try:
        return ftp.nlst()
    except ftplib.error_perm:
        # Some servers reportedly answer NLST on an empty directory with a
        # 5xx reply instead of an empty listing; treat that as "nothing here".
        return []


def _fetch_file(ftp: ftplib.FTP, remote_name: str, dest: Path) -> bool:
    """Download *remote_name* from the current remote directory to *dest*.

    The data is written to ``<dest>.part`` and renamed only after the
    transfer finished, so an interrupted download never leaves a truncated
    file under the final name. Returns True on success, False on failure.
    """
    partial = dest.with_name(dest.name + ".part")
    try:
        with open(partial, "wb") as fh:
            ftp.retrbinary(f"RETR {remote_name}", fh.write)
        partial.replace(dest)
    except ftplib.all_errors as e:
        print(f"⚠️ Failed: {remote_name} ({e})")
        try:
            partial.unlink()
        except OSError:
            pass
        return False
    return True


def _scan_genre_dir(ftp, shortlist_tracks, target_dir, downloaded_norms, matched_norms):
    """Match and fetch the audio files of the current remote genre directory.

    *downloaded_norms* (normalized remote names already obtained) and
    *matched_norms* (normalized shortlist names that were satisfied) are
    updated in place. Returns ``(scanned, downloaded)`` counts.
    """
    scanned = 0
    downloaded = 0

    for fname in _list_dir(ftp):
        if not fname.lower().endswith(AUDIO_EXT):
            continue

        scanned += 1
        # The listing comes from the server: keep only the final path
        # component so a crafted name cannot escape target_dir.
        local_name = os.path.basename(fname)
        ftp_norm = normalize(os.path.splitext(local_name)[0])

        if ftp_norm in downloaded_norms:
            continue

        # First shortlist entry that reaches the threshold wins.
        match = next(
            (
                s for s in shortlist_tracks
                if token_similarity(s["norm"], ftp_norm) >= MATCH_THRESHOLD
            ),
            None,
        )
        if match is None:
            continue

        out = target_dir / local_name

        if out.exists():
            obtained = True
        else:
            print(f"⬇️ Downloading: {fname}")
            obtained = _fetch_file(ftp, fname, out)
            if obtained:
                downloaded += 1
                time.sleep(DOWNLOAD_DELAY)

        if obtained:
            downloaded_norms.add(ftp_norm)
            matched_norms.add(match["norm"])

    return scanned, downloaded


def download_tracks():
    """Crawl the FTP tree and download every file that matches the shortlist.

    Walks ``FTP_BASE/<year>/<month>/<day>/<genre>`` for each entry of
    ``YEARS`` and each shortlist genre (underscores in the local genre name
    become spaces remotely). Audio files whose normalized name reaches
    ``MATCH_THRESHOLD`` against a shortlist entry of that genre are stored
    under ``DOWNLOAD_ROOT/<year without "-DJdownload">/<genre>``. A summary
    and the list of shortlist tracks that were never matched are printed at
    the end.

    Raises:
        RuntimeError: if the shortlist is missing or empty.
        ftplib.all_errors members: if connecting, logging in, or changing to
            ``FTP_BASE`` fails, or if returning to a parent directory fails.
    """
    shortlist = load_shortlist()
    if not shortlist:
        raise RuntimeError("Shortlist is empty")

    scanned = 0
    downloaded = 0
    downloaded_norms = set()
    # Shortlist norms that were satisfied. Needed because a fuzzy match means
    # the remote file's norm is usually not equal to the shortlist norm.
    matched_norms = set()
    missing = []

    # The context manager closes the connection even if the crawl raises.
    with ftplib.FTP(timeout=FTP_TIMEOUT) as ftp:
        ftp.connect(FTP_HOST, FTP_PORT)
        ftp.login(FTP_USER, FTP_PASS)
        ftp.set_pasv(True)
        ftp.cwd(FTP_BASE)

        print("\n🎧 STARTING CONTROLLED DJDOWNLOAD FETCH\n")

        for year in YEARS:
            if not _enter_dir(ftp, year):
                continue
            print(f"\n📅 Year: {year}")

            for month in _list_dir(ftp):
                if not _enter_dir(ftp, month):
                    continue

                for day in _list_dir(ftp):
                    if not _enter_dir(ftp, day):
                        continue

                    for local_genre, shortlist_tracks in shortlist.items():
                        ftp_genre = local_genre.replace("_", " ")

                        if not _enter_dir(ftp, ftp_genre):
                            continue

                        target_dir = DOWNLOAD_ROOT / year.replace("-DJdownload", "") / ftp_genre
                        target_dir.mkdir(parents=True, exist_ok=True)

                        found, fetched = _scan_genre_dir(
                            ftp, shortlist_tracks, target_dir,
                            downloaded_norms, matched_norms,
                        )
                        scanned += found
                        downloaded += fetched

                        ftp.cwd("..")

                    ftp.cwd("..")
                ftp.cwd("..")
            ftp.cwd("..")

    # Find missing: a track counts as obtained if it was the matching
    # shortlist entry, or if its norm equals that of an obtained remote file.
    obtained_norms = downloaded_norms | matched_norms
    for genre, tracks in shortlist.items():
        for t in tracks:
            if t["norm"] not in obtained_norms:
                missing.append(f"{genre} / {t['file']}")

    print("\n==============================")
    print(f"🔎 Scanned files: {scanned}")
    print(f"⬇️ Downloaded: {downloaded}")
    print(f"❌ Missing: {len(missing)}")
    print("==============================\n")

    if missing:
        print("❌ NOT FOUND:")
        for m in missing:
            print(" -", m)

# ============================================================
# ENTRY
# ============================================================

if __name__ == "__main__":
    download_tracks()
