# -*- coding: utf-8 -*-
# Dépendances:
# py -3.13 -m pip install -U yt-dlp youtube-transcript-api
import re, csv, time, random
from pathlib import Path
from yt_dlp import YoutubeDL
from youtube_transcript_api import (
YouTubeTranscriptApi,
TranscriptsDisabled,
NoTranscriptFound,
CouldNotRetrieveTranscript,
)
# --- Settings ---
# YouTube playlist whose entries are listed and matched to MC numbers.
PLAYLIST_URL = "https://www.youtube.com/playlist?list=PLKu5WmfJIJu8ruwTWc7ibegysYYqYby1t"
# Inclusive range of MC numbers to process.
START_NUM = 68
END_NUM = 159
# Output directory for the MCxx.txt transcript files; created as a side effect
# of importing/running this module if it does not exist yet.
OUTPUT_DIR = Path("transcripts")
OUTPUT_DIR.mkdir(exist_ok=True)
# CSV index with one status row per processed MC number (opened in append mode).
CSV_PATH = OUTPUT_DIR / "index.csv"
# Language priority groups (FR -> EN -> auto/any). The ["auto"] group is not
# passed to find_transcript(); it stands for "first transcript that can be fetched".
LANG_PRIORITY = [["fr", "fr-FR"], ["en", "en-US"], ["auto"]]
# --- Utils ---
def fmt_time(seconds: float) -> str:
    """Render a duration given in seconds as an ``HH:MM:SS.mmm`` timestamp.

    The value is first rounded to the nearest whole millisecond, then split
    into hours, minutes, seconds and milliseconds. Hours are zero-padded to
    at least two digits and are not capped at 99.
    """
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"
def clean_text(t: str) -> str:
    """Normalise one caption line and drop pure noise markers.

    Newlines become spaces and surrounding whitespace is stripped. A falsy
    input (``None`` or ``""``) yields ``""``. If the whole cleaned line is one
    of the known bracketed markers (compared case-insensitively), ``""`` is
    returned; otherwise the cleaned text is returned unchanged.
    """
    flattened = ("" if not t else t).replace("\n", " ").strip()
    noise_markers = ("[music]", "[musique]", "[applause]", "[silence]")
    return "" if flattened.lower() in noise_markers else flattened
def extract_mc_number(title: str):
    """Return the number following a standalone "MC" token in *title*.

    The match is case-insensitive, allows optional whitespace between "MC"
    and the digits, and accepts one to three digits delimited by word
    boundaries. Returns the number as an ``int``, or ``None`` when *title* is
    empty/``None`` or contains no such token.
    """
    pattern = re.compile(r"\bMC\s*([0-9]{1,3})\b", re.IGNORECASE)
    found = pattern.search(title or "")
    if found is None:
        return None
    return int(found.group(1))
def fetch_playlist_entries(url: str):
    """Return the playlist's entries as a list, without downloading any media.

    The playlist is resolved through yt-dlp with ``extract_flat`` enabled, so
    each entry is the flat per-video metadata yt-dlp reports (callers in this
    file read its ``id``/``url`` and ``title`` keys).

    Args:
        url: URL of the playlist to list.

    Returns:
        A list of entries in playlist order; empty when yt-dlp returns no
        info or no entries. Positions are preserved as reported by yt-dlp.
    """
    ydl_opts = {"quiet": True, "extract_flat": True, "skip_download": True}
    with YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)
    # extract_info may return None, and "entries" may be absent or None;
    # none of these cases should raise here.
    entries = (info or {}).get("entries") or []
    # Materialise so callers can safely use len() and indexing even if
    # yt-dlp hands back a lazy iterable.
    return list(entries)
# --- Récupération robuste du transcript (sans .get_transcript) ---
def get_transcript_with_fallback(video_id: str):
    """Fetch a transcript for *video_id*, preferring the configured languages.

    Each non-"auto" group of LANG_PRIORITY is tried in order through
    ``find_transcript()``. If none of them can be fetched, the first listed
    transcript that can be fetched (any language, manual or generated) is used.

    Args:
        video_id: YouTube video id.

    Returns:
        A ``(segments, lang_code)`` tuple. ``segments`` is a sequence of dicts
        when the library result offers ``to_raw_data()``; otherwise it is
        whatever ``fetch()`` returned.

    Raises:
        TranscriptsDisabled, NoTranscriptFound (or another
        CouldNotRetrieveTranscript subclass): propagated from listing the
        transcripts.
        NoTranscriptFound: when no listed transcript could be fetched.
    """
    # youtube-transcript-api >= 1.0 exposes the instance method ``list``;
    # older releases only provide the static ``list_transcripts``.
    if hasattr(YouTubeTranscriptApi, "list"):
        transcripts = YouTubeTranscriptApi().list(video_id)
    else:
        transcripts = YouTubeTranscriptApi.list_transcripts(video_id)

    def _fetch(transcript, default_lang):
        """Fetch one transcript and normalise it to (segments, lang_code)."""
        fetched = transcript.fetch()
        # Newer releases return an object exposing to_raw_data(); older ones
        # already return a list of dicts.
        to_raw = getattr(fetched, "to_raw_data", None)
        segments = to_raw() if callable(to_raw) else fetched
        return segments, getattr(transcript, "language_code", default_lang)

    requested = []
    # 1) Preferred languages, in priority order.
    for langs in LANG_PRIORITY:
        if langs == ["auto"]:
            continue
        requested.extend(langs)
        try:
            return _fetch(transcripts.find_transcript(langs), langs[0])
        except (NoTranscriptFound, CouldNotRetrieveTranscript):
            continue

    # 2) Otherwise take the first transcript that can actually be fetched.
    for transcript in transcripts:
        try:
            return _fetch(transcript, "auto")
        except CouldNotRetrieveTranscript:
            continue

    # NoTranscriptFound takes (video_id, requested_language_codes,
    # transcript_data); building it from a single message string raises
    # TypeError instead of the intended exception.
    raise NoTranscriptFound(video_id, requested, transcripts)
# --- Main ---
def main():
    """Download transcripts for MC START_NUM..END_NUM and append to the CSV index.

    For every MC number in the inclusive range, the matching playlist video is
    located by the "MC<number>" token in its title, falling back to the
    playlist position relative to START_NUM. The transcript is written to
    ``OUTPUT_DIR/MCxx.txt`` as ``HH:MM:SS.mmm text`` lines, and one status row
    is appended to CSV_PATH (no_video_match, exists, empty_transcript, ok,
    no_transcript or error:<ExceptionName>). Existing output files are not
    re-downloaded. Ctrl-C stops the run early.
    """
    entries = fetch_playlist_entries(PLAYLIST_URL)

    # Map MC number -> (video_id, title) using the video titles.
    mapping = {}
    for entry in entries:
        if not entry:
            # Guard against placeholder (None) entries in the listing.
            continue
        entry_title = entry.get("title", "")
        number = extract_mc_number(entry_title)
        if number is not None and START_NUM <= number <= END_NUM:
            mapping[number] = (entry.get("id") or entry.get("url"), entry_title)

    # Write the header for a brand-new index and also for an existing but
    # empty file, which would otherwise stay header-less forever.
    write_header = not CSV_PATH.exists() or CSV_PATH.stat().st_size == 0
    with CSV_PATH.open("a", newline="", encoding="utf-8") as csvfile:
        w = csv.writer(csvfile)
        if write_header:
            w.writerow(["mc", "video_id", "url", "lang", "length_chars", "status", "title"])

        for mc in range(START_NUM, END_NUM + 1):
            out = OUTPUT_DIR / f"MC{mc:02d}.txt"
            vid, title = mapping.get(mc, (None, None))
            if not vid:
                # Approximate fallback by relative playlist position when no
                # title carries this "MCxx" number.
                idx = mc - START_NUM
                if 0 <= idx < len(entries):
                    fallback = entries[idx] or {}
                    vid = fallback.get("id") or fallback.get("url")
                    title = fallback.get("title", "")
            if not vid:
                # Covers both "index out of range" and "entry without an id".
                w.writerow([mc, "", "", "", "", "no_video_match", ""])
                print(f"MC{mc:02d} -> pas de correspondance vidéo")
                continue

            url = f"https://www.youtube.com/watch?v={vid}"
            if out.exists():
                w.writerow([mc, vid, url, "", out.stat().st_size, "exists", title or ""])
                print(f"MC{mc:02d} -> déjà présent, skip")
                continue

            try:
                segs, lang = _fetch_with_retries(vid)

                # One "timestamp text" line per non-empty segment.
                lines = []
                for seg in segs:
                    txt = clean_text(_seg_field(seg, "text"))
                    if txt:
                        lines.append(f"{fmt_time(_seg_field(seg, 'start'))} {txt}")

                if not lines:
                    w.writerow([mc, vid, url, lang, 0, "empty_transcript", title or ""])
                    print(f"MC{mc:02d} -> transcript vide")
                    continue

                with out.open("w", encoding="utf-8") as f:
                    f.write("# source: youtube-transcript-api\n")
                    f.write(f"# video: {url}\n\n")
                    f.writelines(line + "\n" for line in lines)

                w.writerow([mc, vid, url, lang, sum(len(x) for x in lines), "ok", title or ""])
                print(f"MC{mc:02d} -> OK ({len(lines)} lignes, lang={lang})")
                time.sleep(0.6 + random.random() * 0.4)  # be polite to the server
            except (TranscriptsDisabled, NoTranscriptFound):
                w.writerow([mc, vid, url, "", "", "no_transcript", title or ""])
                print(f"MC{mc:02d} -> pas de transcript")
            except KeyboardInterrupt:
                print("Interruption manuelle.")
                return
            except Exception as e:
                # Per-video boundary: record the failure and keep going.
                w.writerow([mc, vid, url, "", "", f"error:{type(e).__name__}", title or ""])
                print(f"MC{mc:02d} -> erreur: {e}")

    print(f"\nTerminé. Dossier: {OUTPUT_DIR.resolve()} | Index: {CSV_PATH.resolve()}")


def _fetch_with_retries(video_id, attempts=3):
    """Fetch a transcript, retrying only failures that may be transient."""
    for attempt in range(attempts):
        try:
            return get_transcript_with_fallback(video_id)
        except (TranscriptsDisabled, NoTranscriptFound):
            # Permanent conditions: retrying only wastes time and requests.
            raise
        except Exception:
            if attempt == attempts - 1:
                raise
            time.sleep(1.0 + attempt * 0.8 + random.random() * 0.3)


def _seg_field(seg, name):
    """Read *name* from a segment given either as a dict or as an object."""
    return seg[name] if isinstance(seg, dict) else getattr(seg, name)
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# NOTE: stray paste residue, not part of this script (PHP error output):
# Fatal error: Class 'Dispatcher' not found in /home/madeinm/soldatsdeplomb/index.php on line 28