Files
Sortarr/backend/sortarr/parser.py
2026-05-15 02:41:52 +00:00

142 lines
4.9 KiB
Python

from __future__ import annotations
import re
from pathlib import Path
# Resolution / source tags; group 1 is the matched tag (e.g. "1080p", "WEB-DL").
QUALITY_RE = re.compile(r"\b(2160p|1080p|720p|480p|remux|bluray|web[- .]?dl|webrip|hdtv|dvdrip)\b", re.I)
# Stand-alone four-digit token in the range 1900-2099; group 1 is the year.
YEAR_RE = re.compile(r"\b(19\d{2}|20\d{2})\b")
# "S01E02" style markers; groups are season, episode and an optional second episode.
EPISODE_RE = re.compile(r"[Ss](\d{1,2})[ ._-]*[Ee](\d{1,3})(?:[ ._-]*[Ee](\d{1,3}))?")
# "1x02" style markers; groups 3 and 4 hold an optional second "NxM" pair.
ALT_EPISODE_RE = re.compile(r"\b(\d{1,2})x(\d{1,3})(?:[ ._-]*(\d{1,2})x(\d{1,3}))?\b")
# Season-only markers such as "S01" or "Season 1"; group 1 is the season number.
SEASON_RE = re.compile(r"\b[Ss](?:eason)?[ ._-]*(\d{1,2})\b")
# A "[...]" or "(...)" group, including its delimiters.
BRACKET_RE = re.compile(r"[\[(][^\])]*(?:\]|\))")
# Audio codec / channel-layout tags. Channel layouts ("5.1", "7 1", "2-0") must
# contain an explicit separator: with an optional separator the bare numbers
# 51, 71 and 20 matched as well, so titles such as "Area 51" or
# "20 Feet from Stardom" were truncated at the number.
AUDIO_RE = re.compile(r"\b(?:aac|aac\d(?:[ ._-]?\d)?|ac3|eac3|ddp(?:\d(?:[ ._-]?\d)?)?|dts|truehd|atmos|flac|mp3|opus|5[ ._-]1|7[ ._-]1|2[ ._-]0|6ch|2ch)\b", re.I)
# Video codec, bit-depth and HDR tags.
CODEC_RE = re.compile(r"\b(?:x264|x265|h[ ._-]?264|h[ ._-]?265|hevc|avc|av1|10bit|8bit|hdr|hdr10|dv|dolby[ ._-]?vision)\b", re.I)
# Edition / release-flavour tags.
EDITION_RE = re.compile(r"\b(?:proper|repack|rerip|extended|unrated|directors?[ ._-]?cut|theatrical|imax|multi|line|dubbed|subbed)\b", re.I)
# Known release-group / service names; the match includes the separator in front of the name.
RELEASE_GROUP_RE = re.compile(r"(?:^|[ ._-])(?:YTS|TGx|EZTVx?|MeGusta|PSA|RARBG|NTb|AMZN|DSNP|PMNTP|FLUX|SuccessfulCrab|GalaxyTV)\b", re.I)
# A trailing "<separator>-<name>" suffix at the very end of the string.
TRAILING_GROUP_RE = re.compile(r"(?:[ ._-]+-[ ._-]*[A-Za-z0-9][A-Za-z0-9._-]{1,24})$")
def spaced(raw: str) -> str:
    """Return *raw* as a space-separated string.

    "&" becomes " and ", runs of dots/underscores and runs of whitespace each
    collapse to one space, and leading/trailing spaces, hyphens, dots and
    underscores are removed.
    """
    text = raw.replace("&", " and ")
    # Separators first, then whitespace, so the spaces introduced by the first
    # pass are collapsed by the second.
    for pattern in (r"[\._]+", r"\s+"):
        text = re.sub(pattern, " ", text)
    return text.strip(" -._")
def strip_brackets(raw: str) -> str:
    """Replace every "[...]" or "(...)" group in *raw* with a single space."""
    without_groups = re.sub(BRACKET_RE, " ", raw)
    return without_groups
def strip_release_tail(raw: str) -> str:
    """Remove bracketed groups and release-group markers from *raw*.

    The trailing "-name" suffix is removed before known group names are
    blanked out; the result is normalised with ``spaced``.
    """
    without_suffix = TRAILING_GROUP_RE.sub("", strip_brackets(raw))
    return spaced(RELEASE_GROUP_RE.sub(" ", without_suffix))
def first_noise_index(text: str) -> int | None:
    """Return the offset of the earliest release-noise tag in *text*.

    Quality, audio, codec, edition and release-group patterns are each
    searched once; the smallest match start is returned, or ``None`` when no
    pattern matches. For release-group matches the start is the position of
    the separator in front of the group name.
    """
    earliest = None
    for pattern in (QUALITY_RE, AUDIO_RE, CODEC_RE, EDITION_RE, RELEASE_GROUP_RE):
        hit = pattern.search(text)
        if hit is None:
            continue
        if earliest is None or hit.start() < earliest:
            earliest = hit.start()
    return earliest
def trim_noise(raw: str) -> str:
    """Return the part of *raw* that precedes the first release-noise tag.

    The input is first cleaned with ``strip_release_tail``; when no noise tag
    is found the cleaned text is returned whole.
    """
    cleaned = strip_release_tail(raw)
    cut = first_noise_index(cleaned)
    return spaced(cleaned if cut is None else cleaned[:cut])
def clean_title(raw: str) -> str:
    """Return a display title extracted from a raw file or folder name.

    Release noise is trimmed first, then year tokens and season/episode
    markers are removed. If that leaves nothing but the name did contain a
    year token, the first year token is returned as the title; otherwise the
    fallback is "Unknown".
    """
    trimmed = trim_noise(raw)
    text = trimmed
    for pattern in (YEAR_RE, EPISODE_RE, ALT_EPISODE_RE, SEASON_RE):
        text = pattern.sub(" ", text)
    title = spaced(text)
    if title:
        return title
    # Works named after a year ("1917", "2012", "1923") consist solely of a
    # year token, so stripping years erased the whole title. Keep the first
    # year token instead of reporting "Unknown".
    year_token = YEAR_RE.search(trimmed)
    return year_token.group(1) if year_token else "Unknown"
def clean_episode_title(raw: str) -> str:
    """Return an episode title from the text that follows an episode marker.

    Release noise is trimmed and year tokens are removed; "Episode" is
    returned when nothing is left.

    NOTE(review): a single-word title introduced by " - " (e.g. " - Pilot")
    matches TRAILING_GROUP_RE inside ``strip_release_tail`` and is removed,
    so such input yields "Episode" -- confirm this is intended.
    """
    without_year = YEAR_RE.sub(" ", trim_noise(raw))
    title = spaced(without_year)
    return title if title else "Episode"
def parent_candidate(path: Path) -> str:
    """Return the name of the folder that may carry the release name for *path*.

    A parent folder called "subs", "subtitles" or "sub" (any letter case) is
    skipped in favour of its own parent. An empty string is returned when the
    resulting folder has no usable name.
    """
    folder = path.parent
    if folder.name.lower() in {"subs", "subtitles", "sub"}:
        folder = folder.parent
    candidate = folder.name
    return "" if candidate in {"", ".", "/"} else candidate
def movie_title_source(path: Path, stem: str) -> str:
    """Choose which name a movie title should be parsed from.

    Order of preference: the parent folder if it contains a year token, the
    file stem if it contains one, the parent folder if it is non-empty and
    free of release noise and episode markers, and finally the stem.
    """
    folder = parent_candidate(path)
    if YEAR_RE.search(folder):
        return folder
    if YEAR_RE.search(stem) or not folder:
        return stem
    folder_is_clean = (
        first_noise_index(folder) is None and EPISODE_RE.search(folder) is None
    )
    return folder if folder_is_clean else stem
def _release_year(text: str) -> int | None:
    """Return the release year found in *text*, or ``None`` if it has no year token.

    The first year token is used, unless further year tokens follow it with
    only separators or brackets in between; then the last token of that run is
    taken, so "1917 (2019)" and "Blade.Runner.2049.2017" give 2019 and 2017.
    """
    found = list(YEAR_RE.finditer(text))
    if not found:
        return None
    chosen = found[0]
    for candidate in found[1:]:
        between = text[chosen.end():candidate.start()]
        if between.strip(" ._-()[]"):
            # Real text separates the two tokens, so the later one is not
            # part of the same "title-year release-year" run.
            break
        chosen = candidate
    return int(chosen.group(1))


def parse_media(path: str) -> dict:
    """Parse a media file path into title, year, quality and episode fields.

    Only the path string is inspected. The file stem is checked for an
    "S01E02" marker, then a "1x02" marker, then a season-only marker; if none
    is present the file is treated as a movie.

    Returns a dict with the keys "source", "title", "year" (int or None),
    "quality" (" - <tag>" or ""), "type" ("movie", "episode" or "season"),
    "season", "episode" (ints or None), "multi_episode" ("-Enn" or ""),
    "episode_title" and "extension" (lower-cased suffix).
    """
    p = Path(path)
    stem = p.stem
    parent = parent_candidate(p)

    quality_match = QUALITY_RE.search(stem) or QUALITY_RE.search(parent)
    # The stem wins as the year source whenever it contains a year token.
    year = _release_year(stem if YEAR_RE.search(stem) else parent)

    media_type = "movie"
    season = None
    episode = None
    multi_episode = ""
    episode_title = ""

    episode_match = EPISODE_RE.search(stem)
    alt_match = None if episode_match else ALT_EPISODE_RE.search(stem)
    if episode_match:
        media_type = "episode"
        season = int(episode_match.group(1))
        episode = int(episode_match.group(2))
        if episode_match.group(3):
            multi_episode = f"-E{int(episode_match.group(3)):02d}"
        title = clean_title(stem[:episode_match.start()])
        episode_title = clean_episode_title(stem[episode_match.end():])
    elif alt_match:
        media_type = "episode"
        season = int(alt_match.group(1))
        episode = int(alt_match.group(2))
        if alt_match.group(4):
            # Group 4 is the episode number of the second "NxM" pair.
            multi_episode = f"-E{int(alt_match.group(4)):02d}"
        title = clean_title(stem[:alt_match.start()])
        episode_title = clean_episode_title(stem[alt_match.end():])
    else:
        season_match = SEASON_RE.search(stem)
        if season_match:
            media_type = "season"
            season = int(season_match.group(1))
            # A stem that starts with the season marker has no title text of
            # its own, so fall back to the parent folder, then the full stem.
            title = clean_title(stem[:season_match.start()] or parent or stem)
        else:
            title = clean_title(movie_title_source(p, stem))

    return {
        "source": str(p),
        "title": title,
        "year": year,
        "quality": f" - {quality_match.group(1).replace('.', ' ')}" if quality_match else "",
        "type": media_type,
        "season": season,
        "episode": episode,
        "multi_episode": multi_episode,
        "episode_title": episode_title if media_type == "episode" else "",
        "extension": p.suffix.lower(),
    }