Files
Sortarr/backend/sortarr/library.py

368 lines
14 KiB
Python

from __future__ import annotations
import os
import re
import time
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from .metadata import movie_metadata, series_metadata
from .parser import clean_title, parse_media
from .storage import drive_stats
# Lower-cased directory names that library_roots() accepts as media library roots.
LIBRARY_ROOT_NAMES = {"movies", "shows", "tv", "tv shows"}
# Root names that library_kind()/infer_library_kind() classify as TV; other roots count as movies.
TV_ROOT_NAMES = {"shows", "tv", "tv shows"}
# "S01E02"-style marker: 1-2 season digits, optional " ._-" separators, 1-3 episode digits.
# NOTE(review): not referenced by any code visible in this file -- confirm it is still needed.
EPISODE_RE = re.compile(r"[Ss](\d{1,2})[ ._-]*[Ee](\d{1,3})")
# "season" followed by optional " ._-" separators and 1-2 digits, case-insensitive.
SEASON_FOLDER_RE = re.compile(r"season[ ._-]*(\d{1,2})", re.I)
# Four-digit 19xx/20xx year wrapped in parentheses, e.g. "(1999)".
# NOTE(review): not referenced by any code visible in this file -- confirm it is still needed.
YEAR_RE = re.compile(r"\((19\d{2}|20\d{2})\)")
# Four-digit 19xx/20xx year delimited by word boundaries (parentheses not required).
ANY_YEAR_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
# Resolution / source / codec / edition tokens that file_version() extracts as version tags.
VERSION_RE = re.compile(r"\b(2160p|1080p|720p|480p|remux|bluray|web[- .]?dl|webrip|hdtv|dvdrip|x264|x265|h[ ._-]?264|h[ ._-]?265|hevc|av1|hdr10?|dv|proper|repack|extended|unrated|directors?[ ._-]?cut|theatrical|imax)\b", re.I)
# Lower-cased folder names that is_extra_media() treats as bonus content inside a movie folder.
EXTRA_FOLDER_NAMES = {
"behind the scenes",
"deleted scenes",
"extras",
"featurettes",
"interviews",
"samples",
"scenes",
"shorts",
"trailers",
}
def library_roots(root: Path) -> list[Path]:
    """Return the direct sub-directories of *root* that are named like a library.

    A child qualifies when it is a directory and its lower-cased name is one
    of ``LIBRARY_ROOT_NAMES``.  When *root* cannot be listed (``OSError``) an
    empty list is returned instead of raising.
    """
    try:
        entries = list(root.iterdir())
    except OSError:
        return []
    return [
        entry
        for entry in entries
        if entry.is_dir() and entry.name.lower() in LIBRARY_ROOT_NAMES
    ]
def library_kind(library_root: Path) -> str:
    """Classify a library root as ``"tv"`` or ``"movie"`` from its directory name."""
    if library_root.name.lower() in TV_ROOT_NAMES:
        return "tv"
    return "movie"
def infer_library_kind(path: str) -> str:
    """Guess the library kind of *path* from the names of its components.

    Returns ``"tv"`` when any component (case-insensitively) is a TV root
    name, ``"movie"`` when a component is ``"movies"``, otherwise ``"other"``.
    TV root names take precedence over ``"movies"``.
    """
    components = {component.lower() for component in Path(path).parts}
    if not components.isdisjoint(TV_ROOT_NAMES):
        return "tv"
    return "movie" if "movies" in components else "other"
def split_library_path(path: str) -> tuple[str, list[str]]:
    """Split *path* at its library root component.

    Returns ``(root, rest)`` where *root* is the first path component (scanning
    left to right) whose lower-cased name is in ``LIBRARY_ROOT_NAMES``, with its
    original casing, and *rest* is the list of components after it.  When no
    component matches, returns ``("", all_components)``.

    The components are scanned in path order rather than iterating the set of
    root names: set iteration order depends on string hashing, so a path that
    contains two root names (e.g. ``/mnt/tv/Movies/...``) could otherwise be
    split differently from one process to the next.
    """
    parts = list(Path(path).parts)
    for index, part in enumerate(parts):
        if part.lower() in LIBRARY_ROOT_NAMES:
            return part, parts[index + 1:]
    return "", parts
def identity_slug(title: str) -> str:
    """Reduce *title* to a comparison slug.

    The title is lower-cased, every run of characters outside ``a-z0-9`` is
    collapsed to a single space, and surrounding whitespace is removed.
    """
    lowered = title.lower()
    collapsed = re.sub(r"[^a-z0-9]+", " ", lowered)
    return collapsed.strip()
def clean_collection_title(name: str) -> tuple[str, int | None]:
    """Derive a display title and release year from a folder or file name.

    The year is the first match of ``ANY_YEAR_RE`` in *name* (``None`` when
    there is none); the title is whatever ``clean_title`` returns for *name*.
    """
    found = ANY_YEAR_RE.search(name)
    year = None if found is None else int(found.group(1))
    return clean_title(name), year
def merge_key(kind: str, title: str, year: int | None = None) -> str:
    """Build the key under which files of the same movie or show are grouped.

    Movies are keyed by slug and year (the year part is empty when unknown);
    every other kind is keyed as a TV show by slug only.
    """
    slug = identity_slug(title)
    if kind != "movie":
        return f"tv::{slug}"
    return f"movie::{slug}::{year or ''}"
def file_version(item: dict) -> dict:
    """Describe one physical copy of a movie from its path.

    Version tags are collected with ``VERSION_RE`` from the parent folder name
    and the file stem.  Dots and underscores inside a tag are replaced by
    spaces.  Tags are de-duplicated case-insensitively and regardless of the
    separator used, so "WEB-DL", "WEB.DL" and "web dl" count as one tag; the
    first spelling encountered is the one kept.

    Args:
        item: Scanned item; the "path", "name", "drive" and "size" keys are read.
            A missing or null "path" is treated as an empty path.

    Returns:
        A dict with "path", "name", "drive", "size" (0 when missing), "quality"
        (the first resolution tag found, or "") and "tags" (at most eight).
    """
    path = Path(item.get("path") or "")
    text = " ".join(part for part in (path.parent.name, path.stem) if part)

    tags: list[str] = []
    seen: set[str] = set()
    for match in VERSION_RE.finditer(text):
        spaced = match.group(1).replace(".", " ").replace("_", " ")
        tag = re.sub(r"\s+", " ", spaced).strip()
        # Compare on alphanumerics only so the separator style cannot create
        # duplicates of the same tag.
        fingerprint = re.sub(r"[^a-z0-9]+", "", tag.lower())
        if fingerprint not in seen:
            seen.add(fingerprint)
            tags.append(tag)

    resolutions = {"2160p", "1080p", "720p", "480p"}
    return {
        "path": item.get("path"),
        "name": item.get("name"),
        "drive": item.get("drive"),
        "size": item.get("size") or 0,
        "quality": next((tag for tag in tags if tag.lower() in resolutions), ""),
        "tags": tags[:8],
    }
def is_extra_media(path: Path, library_root: Path, kind: str, app: dict) -> bool:
    """Tell whether *path* is bonus material rather than a main feature.

    A file counts as an extra when either
    - *kind* is ``"movie"`` and any folder below the movie's own top-level
      folder is named like an entry of ``EXTRA_FOLDER_NAMES``, or
    - its file name contains one of ``app["extra_keywords"]`` (case-insensitive;
      empty keywords are ignored).

    Dots and underscores in folder and file names are treated as spaces.
    If *path* is not located under *library_root*, the whole path is used.
    """
    def spaced(text: str) -> str:
        return text.lower().replace("_", " ").replace(".", " ")

    try:
        relative = path.relative_to(library_root)
    except ValueError:
        relative = path

    if kind == "movie":
        folders = [spaced(part) for part in relative.parts[:-1]]
        # folders[0] is the movie's own folder; only look beneath it.
        for folder in folders[1:]:
            if folder in EXTRA_FOLDER_NAMES:
                return True

    file_label = spaced(path.name)
    for keyword in app.get("extra_keywords", []):
        if keyword and keyword.lower() in file_label:
            return True
    return False
def item_identity(item: dict) -> dict:
    """Work out which movie or show a scanned item belongs to.

    The library kind comes from ``item["library"]`` or, failing that, from the
    path.  The title is taken from the first path component below the library
    root.  For TV items the season/episode come from ``parse_media``; when the
    parser reports no season, a "Season N" style path component is used.
    Items that are not TV (or TV items with nothing below the library root)
    are described as movies, with the year taken from the folder name first
    and from the parsed file name second.

    Returns:
        For TV: "kind", "root", "title", "key", "season", "episode".
        For movies: "kind", "root", "title", "year", "slug", "key".
    """
    item_path = item.get("path", "")
    root, rel = split_library_path(item_path)
    kind = item.get("library") or infer_library_kind(item_path)
    parsed = parse_media(item.get("path", item.get("name", "")))

    if kind == "tv" and rel:
        title = clean_title(rel[0])
        season = parsed.get("season")
        episode = parsed.get("episode")
        for segment in rel:
            if season:
                break
            found = SEASON_FOLDER_RE.search(segment)
            if found:
                season = int(found.group(1))
        return {
            "kind": "tv",
            "root": root,
            "title": title,
            "key": merge_key("tv", title),
            "season": season,
            "episode": episode,
        }

    source_name = rel[0] if rel else parsed["title"]
    title, folder_year = clean_collection_title(source_name)
    year = folder_year or parsed.get("year")
    return {
        "kind": "movie",
        "root": root,
        "title": title,
        "year": year,
        "slug": identity_slug(title),
        "key": merge_key("movie", title, year),
    }
def normalize_library(library: dict) -> dict:
    """Fill in derived fields of a library dict in place and return it.

    Every item gets a "library" kind (inferred from its path when missing),
    ``library["counts"]`` is recomputed, and ``library["collections"]`` is built
    from the items when the key is absent.

    A missing *or null* "items" entry is treated as an empty list, matching
    how ``enrich_library_metadata`` reads the same key before calling this
    function.
    """
    items = library.get("items") or []
    kinds = Counter()
    for item in items:
        kind = item.get("library") or infer_library_kind(item.get("path", ""))
        item["library"] = kind
        if kind in {"movie", "tv"}:
            kinds[kind] += 1
    library["counts"] = {
        "movies": kinds.get("movie", 0),
        "tv": kinds.get("tv", 0),
        "total": len(items),
    }
    if "collections" not in library:
        library["collections"] = build_collections({}, items)
    return library
def build_collections(config: dict, items: list[dict], enrich: bool = False) -> dict:
    """Group scanned items into movie and series collections.

    Args:
        config: Application configuration.  Only used when *enrich* is true;
            an empty config disables enrichment.
        items: Scanned media items (see ``library_snapshot``).
        enrich: When true (and *config* is non-empty) metadata is looked up
            through ``movie_metadata`` / ``series_metadata``.

    Returns:
        ``{"movies": [...], "series": [...]}``, each sorted by lower-cased
        title.  Every series carries a list of seasons sorted by season
        number, each with a list of episodes sorted by episode number.
        Episodes known only from metadata are included with status
        "missing" or "upcoming".
    """
    movies, series = _group_items(items)
    if enrich and config:
        _fetch_metadata(config, movies, series)
    _merge_series_metadata(series)
    return {
        "movies": sorted(movies.values(), key=lambda movie: movie["title"].lower()),
        "series": sorted(series.values(), key=lambda show: show["title"].lower()),
    }


def _group_items(items: list[dict]) -> tuple[dict[str, dict], dict[str, dict]]:
    """Bucket items into movies and series keyed by their merge key."""
    movies: dict[str, dict] = {}
    series: dict[str, dict] = {}
    for item in items:
        identity = item_identity(item)
        if identity["kind"] == "tv":
            _add_episode_file(series, identity, item)
        else:
            _add_movie_file(movies, identity, item)
    return movies, series


def _add_episode_file(series: dict[str, dict], identity: dict, item: dict) -> None:
    """Attach *item* to its show, season and episode, creating them on demand."""
    show = series.setdefault(identity["key"], {
        "key": identity["key"],
        "title": identity["title"],
        "library": "tv",
        "files": [],
        "seasons": {},
        "metadata": {"title": identity["title"], "source": "filename", "seasons": {}},
    })
    show["files"].append(item)
    # Unknown season/episode numbers are grouped under 0.
    season_no = identity.get("season") or 0
    episode_no = identity.get("episode") or 0
    season = show["seasons"].setdefault(str(season_no), {"season": season_no, "episodes": {}})
    episode = season["episodes"].setdefault(str(episode_no), {
        "season": season_no,
        "episode": episode_no,
        "title": f"S{season_no:02d}E{episode_no:02d}" if season_no and episode_no else item["name"],
        "files": [],
        "status": "present",
    })
    episode["files"].append(item)


def _add_movie_file(movies: dict[str, dict], identity: dict, item: dict) -> None:
    """Attach *item* to its movie, merging year-less and dated copies of one title."""
    key = identity["key"]
    if not identity.get("year"):
        # A copy without a year joins the first movie already seen with the same slug.
        existing_key = next(
            (
                candidate_key
                for candidate_key, candidate in movies.items()
                if candidate.get("slug") == identity["slug"]
            ),
            None,
        )
        if existing_key:
            key = existing_key
    elif key not in movies:
        # First dated copy: adopt an entry created earlier from a year-less copy.
        no_year_key = merge_key("movie", identity["title"], None)
        if no_year_key in movies:
            movies[key] = movies.pop(no_year_key)
            movies[key]["key"] = key
    movie = movies.setdefault(key, {
        "key": key,
        "title": identity["title"],
        "year": identity.get("year"),
        "slug": identity.get("slug"),
        "library": "movie",
        "files": [],
        "versions": [],
        "metadata": {"title": identity["title"], "source": "filename"},
    })
    movie["files"].append(item)
    movie["versions"].append(file_version(item))
    if not movie.get("year") and identity.get("year"):
        movie["year"] = identity.get("year")


def _fetch_metadata(config: dict, movies: dict[str, dict], series: dict[str, dict]) -> None:
    """Look up metadata for every movie and show on a bounded thread pool.

    Lookups are best effort: a failed lookup is logged and the entry keeps the
    metadata it already had.
    """
    import logging

    logger = logging.getLogger(__name__)
    workers = int(config.get("app", {}).get("metadata_parallelism", 8))
    tasks = {}
    # The pool size is clamped to the range 1..12 whatever the configuration says.
    with ThreadPoolExecutor(max_workers=max(1, min(workers, 12))) as executor:
        for movie in movies.values():
            future = executor.submit(movie_metadata, config, movie["title"], movie.get("year"))
            tasks[future] = movie
        for show in series.values():
            present_seasons = {int(season) for season in show["seasons"] if int(season) > 0}
            future = executor.submit(series_metadata, config, show["title"], present_seasons)
            tasks[future] = show
        for future in as_completed(tasks):
            entry = tasks[future]
            try:
                entry["metadata"] = future.result()
            except Exception:
                logger.warning(
                    "Metadata lookup failed for %r; keeping existing metadata",
                    entry.get("title"),
                    exc_info=True,
                )


def _merge_series_metadata(series: dict[str, dict]) -> None:
    """Merge per-episode metadata into each show and turn its seasons into sorted lists."""
    today = time.strftime("%Y-%m-%d")
    for show in series.values():
        seasons = show["seasons"]
        # Tolerate a lookup that stored None or omitted "seasons".
        season_metadata = (show.get("metadata") or {}).get("seasons") or {}
        for season_no, season_meta in season_metadata.items():
            # File-derived seasons are keyed by str; normalise so an int key
            # from metadata cannot create a second entry for the same season.
            season = seasons.setdefault(str(season_no), {"season": int(season_no), "episodes": {}})
            for meta_episode in (season_meta or {}).get("episodes") or []:
                key = str(meta_episode.get("episode") or 0)
                existing = season["episodes"].get(key)
                if existing:
                    existing.update({
                        "title": meta_episode.get("title") or existing["title"],
                        "air_date": meta_episode.get("air_date"),
                        "overview": meta_episode.get("overview"),
                        "still": meta_episode.get("still"),
                    })
                else:
                    air_date = meta_episode.get("air_date")
                    season["episodes"][key] = {
                        **meta_episode,
                        "files": [],
                        # The string comparison with today assumes air_date is
                        # an ISO "YYYY-MM-DD" string.
                        "status": "upcoming" if air_date and air_date > today else "missing",
                    }
        for season in seasons.values():
            season["episodes"] = sorted(season["episodes"].values(), key=lambda ep: ep.get("episode") or 0)
        show["seasons"] = sorted(seasons.values(), key=lambda season: season["season"])
# Carry metadata whose "source" is "tmdb" over from a previous library snapshot
# onto freshly built collections; entries are matched by their "key".
# `collections` is modified in place and also returned.
# NOTE(review): indentation is missing in this copy of the file, so the nesting
# of the "manual" and "release_date" conditions below cannot be read off the
# text -- confirm against the repository before re-indenting.
def preserve_metadata(collections: dict, previous_library: dict | None) -> dict:
# A missing previous library or missing/null "collections" yields an empty lookup.
previous = (previous_library or {}).get("collections") or {}
# Index the previous movies and series by key; entries without a key are skipped.
previous_by_key = {
item.get("key"): item
for group in ("movies", "series")
for item in previous.get(group, [])
if item.get("key")
}
for group in ("movies", "series"):
for item in collections.get(group, []):
old = previous_by_key.get(item.get("key"))
old_meta = (old or {}).get("metadata") or {}
# Only TMDB-sourced metadata replaces the new entry's metadata.
if old_meta.get("source") == "tmdb":
item["metadata"] = old_meta
# Metadata flagged "manual" also overrides the displayed title.
if old_meta.get("manual"):
item["title"] = old_meta.get("title") or item.get("title")
# For movies the year is taken from the first four characters of release_date.
# NOTE(review): int() raises ValueError if those characters are not digits -- consider guarding.
if item.get("library") == "movie" and old_meta.get("release_date"):
item["year"] = int(old_meta["release_date"][:4])
return collections
def library_snapshot(config: dict, previous_library: dict | None = None) -> dict:
    """Scan the configured drives and build a library snapshot.

    For every drive in ``config["drives"]`` the library roots found by
    ``library_roots`` are walked.  Every file that can be stat-ed is counted
    per extension; files whose lower-cased suffix is in
    ``app["media_extensions"]`` and that are not extras become items.

    The scan stops once ``app["library_scan_max_files"]`` (default 20000) files
    were counted or ``app["library_scan_timeout_seconds"]`` (default 8) elapsed;
    the snapshot is then flagged "truncated".

    Metadata is enriched when ``config["metadata"]["tmdb_enabled"]`` (default
    true) is set and at most ``app["library_metadata_enrich_max_items"]``
    (default 500) items were found; otherwise metadata from *previous_library*
    is carried over with ``preserve_metadata``.

    Args:
        config: Application configuration; ``config["app"]`` is required.
        previous_library: The last snapshot, if any.  Its "identifications"
            are copied into the result.

    Returns:
        The snapshot dict after ``normalize_library``.
    """
    app = config["app"]
    ignored_dirs = {"$RECYCLE.BIN", "System Volume Information", ".Trash-1000"}
    # Read the per-scan constants once instead of once per file.
    media_extensions = frozenset(app.get("media_extensions") or ())
    subtitle_extensions = tuple(app.get("subtitle_extensions") or ())
    max_files = int(app.get("library_scan_max_files", 20000))
    deadline = time.monotonic() + int(app.get("library_scan_timeout_seconds", 8))

    items = []
    extensions = Counter()
    scanned = 0
    truncated = False

    def limit_reached() -> bool:
        # Both conditions are monotonic: once true they stay true for this scan.
        return scanned >= max_files or time.monotonic() >= deadline

    for drive in config.get("drives", []):
        if truncated or limit_reached():
            truncated = True
            break
        root = Path(drive["path"])
        if not root.exists():
            continue
        for library_root in library_roots(root):
            if truncated:
                # Do not start walking another root after the limit was hit.
                break
            kind = library_kind(library_root)
            # os.walk skips directories it cannot list by default.
            for current, dirs, files in os.walk(library_root):
                if limit_reached():
                    truncated = True
                    break
                # Prune in place so os.walk does not descend into ignored directories.
                dirs[:] = [name for name in dirs if name not in ignored_dirs]
                lower_files = {name.lower() for name in files}
                for filename in files:
                    if limit_reached():
                        truncated = True
                        break
                    path = Path(current) / filename
                    try:
                        file_stat = path.stat()
                    except OSError:
                        continue
                    scanned += 1
                    suffix = path.suffix.lower()
                    extensions[suffix or "none"] += 1
                    if suffix not in media_extensions:
                        continue
                    if is_extra_media(path, library_root, kind, app):
                        continue
                    # A subtitle counts only when it shares the exact file stem.
                    subtitle_names = [f"{path.stem}{ext}".lower() for ext in subtitle_extensions]
                    items.append({
                        "path": str(path),
                        "name": path.name,
                        "drive": drive["id"],
                        "library": kind,
                        "root": library_root.name,
                        "size": file_stat.st_size,
                        "modified": file_stat.st_mtime,
                        "has_subtitles": any(name in lower_files for name in subtitle_names),
                    })
                if truncated:
                    break

    enrich_limit = int(app.get("library_metadata_enrich_max_items", 500))
    should_enrich = bool(config.get("metadata", {}).get("tmdb_enabled", True)) and len(items) <= enrich_limit
    collections = build_collections(config, items, enrich=should_enrich)
    if not should_enrich:
        collections = preserve_metadata(collections, previous_library)
    return normalize_library({
        "drives": drive_stats(config),
        "items": sorted(items, key=lambda item: item["modified"], reverse=True),
        "collections": collections,
        "extensions": dict(extensions.most_common()),
        "scanned_files": scanned,
        "truncated": truncated,
        "metadata_enriched": should_enrich,
        "identifications": (previous_library or {}).get("identifications", {}),
    })
def enrich_library_metadata(config: dict, library: dict) -> dict:
    """Return a copy of *library* whose collections were rebuilt with metadata lookups.

    The input dict itself is not modified at the top level: a shallow copy
    receives the rebuilt "collections", ``"metadata_enriched": True`` and a
    "metadata_refreshed_at" timestamp (``time.time()``), and is then passed
    through ``normalize_library``.
    """
    items = library.get("items") or []
    refreshed = dict(library)
    refreshed["collections"] = build_collections(config, items, enrich=True)
    refreshed["metadata_enriched"] = True
    refreshed["metadata_refreshed_at"] = time.time()
    return normalize_library(refreshed)