# -*- coding: utf-8 -*-
"""Download timestamped transcripts for the numbered "MC" videos of a playlist.

For every number between START_NUM and END_NUM the script locates the
matching playlist video (by an "MC<n>" marker in its title, else by position),
fetches a transcript (French first, then English, then anything available),
writes it to ``transcripts/MC<n>.txt`` and appends a status row to
``transcripts/index.csv``.
"""
# Dependencies:
#   py -3.13 -m pip install -U yt-dlp youtube-transcript-api
import csv
import random
import re
import time
from pathlib import Path

from yt_dlp import YoutubeDL
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
    CouldNotRetrieveTranscript,
)

# --- Settings ---
PLAYLIST_URL = "https://www.youtube.com/playlist?list=PLKu5WmfJIJu8ruwTWc7ibegysYYqYby1t"
START_NUM = 68
END_NUM = 159
OUTPUT_DIR = Path("transcripts")
OUTPUT_DIR.mkdir(exist_ok=True)
CSV_PATH = OUTPUT_DIR / "index.csv"

# Language priorities (FR -> EN -> auto/any). The ["auto"] group is only a
# marker for "take whatever is available"; it is never passed to the API.
LANG_PRIORITY = [["fr", "fr-FR"], ["en", "en-US"], ["auto"]]

# Number of attempts made per video before a fetch error is reported.
MAX_ATTEMPTS = 3

# Caption placeholders that carry no spoken content and are dropped.
_NOISE_MARKERS = frozenset({"[music]", "[musique]", "[applause]", "[silence]"})

# Matches "MC 12", "mc12", "MC123" ... as a standalone token in a title.
_MC_PATTERN = re.compile(r"\bMC\s*([0-9]{1,3})\b", re.IGNORECASE)


# --- Utils ---
def fmt_time(seconds: float) -> str:
    """Format a duration in seconds as ``HH:MM:SS.mmm``."""
    total_ms = int(round(seconds * 1000))
    h, rem = divmod(total_ms, 3_600_000)
    m, rem = divmod(rem, 60_000)
    s, ms = divmod(rem, 1000)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"


def clean_text(t: str) -> str:
    """Flatten a caption to one line; return "" for pure noise markers."""
    t = (t or "").replace("\n", " ").strip()
    if t.lower() in _NOISE_MARKERS:
        return ""
    return t


def extract_mc_number(title: str):
    """Return the integer following "MC" in *title*, or None if absent."""
    m = _MC_PATTERN.search(title or "")
    return int(m.group(1)) if m else None


def fetch_playlist_entries(url: str):
    """Return the playlist entries (flat dicts with id/title) via yt-dlp.

    Missing entries (``None`` placeholders) and a missing info dict are
    tolerated: the result is always a list of dict-like entries.
    """
    ydl_opts = {"quiet": True, "extract_flat": True, "skip_download": True}
    with YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)
    entries = (info or {}).get("entries") or []
    return [e for e in entries if e]


def _list_transcripts(video_id: str):
    """List the transcripts of a video on both old and new library versions."""
    # Newer releases expose an instance method ``list``; older ones only have
    # the static ``list_transcripts``.
    if hasattr(YouTubeTranscriptApi, "list"):
        return YouTubeTranscriptApi().list(video_id)
    return YouTubeTranscriptApi.list_transcripts(video_id)


def _segment_field(seg, name: str):
    """Read *name* from a segment that is either a dict or a snippet object."""
    if isinstance(seg, dict):
        return seg[name]
    return getattr(seg, name)


# --- Robust transcript retrieval (without .get_transcript) ---
def get_transcript_with_fallback(video_id: str):
    """Fetch a transcript, trying FR then EN, then any available transcript.

    Returns:
        A ``(segments, lang_code)`` tuple.

    Raises:
        TranscriptsDisabled: propagated from the listing call.
        NoTranscriptFound: when no listed transcript could be fetched.
    """
    transcripts = _list_transcripts(video_id)

    # 1) Look for the preferred languages, in order.
    preferred = [langs for langs in LANG_PRIORITY if langs != ["auto"]]
    for langs in preferred:
        try:
            t = transcripts.find_transcript(langs)
            return t.fetch(), getattr(t, "language_code", langs[0])
        except (NoTranscriptFound, CouldNotRetrieveTranscript):
            continue

    # 2) Otherwise take the first transcript that can actually be fetched.
    for t in transcripts:
        try:
            return t.fetch(), getattr(t, "language_code", "auto")
        except CouldNotRetrieveTranscript:
            continue

    # NoTranscriptFound requires (video_id, requested codes, transcript list);
    # passing a bare message raises TypeError instead of the intended error.
    requested = [code for langs in preferred for code in langs]
    raise NoTranscriptFound(video_id, requested, transcripts)


def _fetch_with_retries(video_id: str, attempts: int = MAX_ATTEMPTS):
    """Call get_transcript_with_fallback with simple retries on failure."""
    attempts = max(1, attempts)
    for attempt in range(attempts):
        try:
            return get_transcript_with_fallback(video_id)
        except TranscriptsDisabled:
            # Permanent condition: retrying cannot help.
            raise
        except Exception:
            if attempt == attempts - 1:
                raise
            time.sleep(1.0 + attempt * 0.8 + random.random() * 0.3)


def _format_segments(segs):
    """Turn transcript segments into "HH:MM:SS.mmm text" lines, skipping noise."""
    lines = []
    for seg in segs:
        txt = clean_text(_segment_field(seg, "text"))
        if txt:
            lines.append(f"{fmt_time(_segment_field(seg, 'start'))} {txt}")
    return lines


# --- Main ---
def main():
    """Fetch every MC transcript in range and record the outcome in the CSV."""
    entries = fetch_playlist_entries(PLAYLIST_URL)

    # Build the MC number -> (video_id, title) mapping from the titles.
    mapping = {}
    for e in entries:
        vid = e.get("id") or e.get("url")
        title = e.get("title", "")
        n = extract_mc_number(title)
        if n is not None and START_NUM <= n <= END_NUM:
            mapping[n] = (vid, title)

    # The CSV is opened in append mode, so the header is only written once.
    write_header = not CSV_PATH.exists()
    with CSV_PATH.open("a", newline="", encoding="utf-8") as csvfile:
        w = csv.writer(csvfile)
        if write_header:
            w.writerow(["mc", "video_id", "url", "lang", "length_chars", "status", "title"])

        for mc in range(START_NUM, END_NUM + 1):
            out = OUTPUT_DIR / f"MC{mc:02d}.txt"
            vid, title = mapping.get(mc, (None, None))

            if not vid:
                # Approximate fallback: relative playlist position, used when
                # no title contains "MCxx" for this number.
                idx = mc - START_NUM
                if 0 <= idx < len(entries):
                    vid = entries[idx].get("id") or entries[idx].get("url")
                    title = entries[idx].get("title", "")

            if not vid:
                w.writerow([mc, "", "", "", "", "no_video_match", ""])
                print(f"MC{mc:02d} -> pas de correspondance vidéo")
                continue

            url = f"https://www.youtube.com/watch?v={vid}"

            if out.exists():
                w.writerow([mc, vid, url, "", out.stat().st_size, "exists", title or ""])
                print(f"MC{mc:02d} -> déjà présent, skip")
                continue

            try:
                segs, lang = _fetch_with_retries(vid)
                lines = _format_segments(segs)

                if not lines:
                    w.writerow([mc, vid, url, lang, 0, "empty_transcript", title or ""])
                    print(f"MC{mc:02d} -> transcript vide")
                    continue

                with out.open("w", encoding="utf-8") as f:
                    f.write("# source: youtube-transcript-api\n")
                    f.write(f"# video: {url}\n\n")
                    f.writelines(line + "\n" for line in lines)

                w.writerow([mc, vid, url, lang, sum(len(x) for x in lines), "ok", title or ""])
                print(f"MC{mc:02d} -> OK ({len(lines)} lignes, lang={lang})")
                time.sleep(0.6 + random.random() * 0.4)  # be polite to the server

            except (TranscriptsDisabled, NoTranscriptFound):
                w.writerow([mc, vid, url, "", "", "no_transcript", title or ""])
                print(f"MC{mc:02d} -> pas de transcript")
            except KeyboardInterrupt:
                print("Interruption manuelle.")
                return
            except Exception as e:
                # Top-level boundary: record the failure and move on to the next video.
                w.writerow([mc, vid, url, "", "", f"error:{type(e).__name__}", title or ""])
                print(f"MC{mc:02d} -> erreur: {e}")

    print(f"\nTerminé. Dossier: {OUTPUT_DIR.resolve()} | Index: {CSV_PATH.resolve()}")


if __name__ == "__main__":
    main()
# NOTE: stray text, not part of this script: Fatal error: Class 'Dispatcher' not found in /home/madeinm/soldatsdeplomb/index.php on line 28