Initial commit
This commit is contained in:
141
backend/sortarr/parser.py
Normal file
141
backend/sortarr/parser.py
Normal file
@@ -0,0 +1,141 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
# Token-boundary assertions used in place of ``\b``.
#
# ``\b`` treats "_" as a word character, so a name such as
# "My_Movie_2020_1080p" never produced a year/quality/codec match even though
# the rest of this module treats "_" as a separator (see the ``[ ._-]``
# classes below). ``[^\W_]`` is "any word character except underscore", so
# these look-arounds behave exactly like ``\b`` around a token that starts and
# ends with a word character, except that "_" now counts as a separator.
_TOKEN_START = r"(?<![^\W_])"
_TOKEN_END = r"(?![^\W_])"

# Resolution / source tags; group 1 is the matched tag.
QUALITY_RE = re.compile(
    _TOKEN_START
    + r"(2160p|1080p|720p|480p|remux|bluray|web[- .]?dl|webrip|hdtv|dvdrip)"
    + _TOKEN_END,
    re.I,
)

# Four-digit year in the range 1900-2099; group 1 is the year.
YEAR_RE = re.compile(_TOKEN_START + r"(19\d{2}|20\d{2})" + _TOKEN_END)

# "S01E02" with an optional second episode ("S01E02E03" / "S01E02-E03").
# Groups: 1 = season, 2 = episode, 3 = optional last episode.
EPISODE_RE = re.compile(r"[Ss](\d{1,2})[ ._-]*[Ee](\d{1,3})(?:[ ._-]*[Ee](\d{1,3}))?")

# "1x02" with an optional second "1x03".
# Groups: 1 = season, 2 = episode, 3/4 = optional second season/episode.
ALT_EPISODE_RE = re.compile(
    _TOKEN_START
    + r"(\d{1,2})x(\d{1,3})(?:[ ._-]*(\d{1,2})x(\d{1,3}))?"
    + _TOKEN_END
)

# "S01" / "Season 1" without an episode number; group 1 is the season.
SEASON_RE = re.compile(
    _TOKEN_START + r"[Ss](?:eason)?[ ._-]*(\d{1,2})" + _TOKEN_END
)

# A "[...]" or "(...)" segment (no nesting).
BRACKET_RE = re.compile(r"[\[(][^\])]*(?:\]|\))")

# Audio codec / channel-layout tags.
AUDIO_RE = re.compile(
    _TOKEN_START
    + r"(?:aac|aac\d(?:[ ._-]?\d)?|ac3|eac3|ddp(?:\d(?:[ ._-]?\d)?)?|dts|truehd|atmos|flac|mp3|opus|5[ ._-]?1|7[ ._-]?1|2[ ._-]?0|6ch|2ch)"
    + _TOKEN_END,
    re.I,
)

# Video codec / bit-depth / HDR tags.
CODEC_RE = re.compile(
    _TOKEN_START
    + r"(?:x264|x265|h[ ._-]?264|h[ ._-]?265|hevc|avc|av1|10bit|8bit|hdr|hdr10|dv|dolby[ ._-]?vision)"
    + _TOKEN_END,
    re.I,
)

# Edition / release-flavour tags.
EDITION_RE = re.compile(
    _TOKEN_START
    + r"(?:proper|repack|rerip|extended|unrated|directors?[ ._-]?cut|theatrical|imax|multi|line|dubbed|subbed)"
    + _TOKEN_END,
    re.I,
)

# Known release-group / source names, preceded by a separator or string start.
# The match includes the leading separator character when there is one.
RELEASE_GROUP_RE = re.compile(
    r"(?:^|[ ._-])(?:YTS|TGx|EZTVx?|MeGusta|PSA|RARBG|NTb|AMZN|DSNP|PMNTP|FLUX|SuccessfulCrab|GalaxyTV)"
    + _TOKEN_END,
    re.I,
)

# A trailing "<sep>-<name>" suffix at the very end of the string, where the
# hyphen must itself be preceded by at least one separator character.
TRAILING_GROUP_RE = re.compile(r"(?:[ ._-]+-[ ._-]*[A-Za-z0-9][A-Za-z0-9._-]{1,24})$")
|
||||
|
||||
|
||||
def spaced(raw: str) -> str:
    """Turn a dotted/underscored name into a space-separated string.

    "&" becomes " and ", every run of "." or "_" becomes one space, all
    whitespace runs are collapsed to a single space, and leading/trailing
    spaces, hyphens, dots and underscores are removed.
    """
    expanded = raw.replace("&", " and ")
    separated = re.sub(r"[\._]+", " ", expanded)
    collapsed = re.sub(r"\s+", " ", separated)
    return collapsed.strip(" -._")
|
||||
|
||||
|
||||
def strip_brackets(raw: str) -> str:
    """Replace every ``[...]`` / ``(...)`` segment matched by BRACKET_RE with a space."""
    without_brackets = BRACKET_RE.sub(" ", raw)
    return without_brackets
|
||||
|
||||
|
||||
def strip_release_tail(raw: str) -> str:
    """Remove release decoration from *raw* and normalise its separators.

    In order: bracketed segments are blanked, a trailing group suffix
    (TRAILING_GROUP_RE) is cut off, known release-group names
    (RELEASE_GROUP_RE) are blanked, and the result is passed through
    ``spaced``.
    """
    unbracketed = strip_brackets(raw)
    # The trailing suffix is anchored at end-of-string, so it must be removed
    # before the group names are replaced by spaces.
    without_suffix = TRAILING_GROUP_RE.sub("", unbracketed)
    without_groups = RELEASE_GROUP_RE.sub(" ", without_suffix)
    return spaced(without_groups)
|
||||
|
||||
|
||||
def first_noise_index(text: str) -> int | None:
    """Return the start offset of the earliest noise token in *text*.

    Noise is anything matched by the quality, audio, codec, edition or
    release-group patterns. Returns ``None`` when none of them match.
    """
    earliest = None
    for pattern in (QUALITY_RE, AUDIO_RE, CODEC_RE, EDITION_RE, RELEASE_GROUP_RE):
        hit = pattern.search(text)
        if hit is None:
            continue
        if earliest is None or hit.start() < earliest:
            earliest = hit.start()
    return earliest
|
||||
|
||||
|
||||
def trim_noise(raw: str) -> str:
    """Strip release decoration and cut *raw* at the first noise token.

    The string is first cleaned with ``strip_release_tail``; everything from
    the earliest match reported by ``first_noise_index`` onwards is dropped,
    and the remainder is normalised with ``spaced``.
    """
    cleaned = strip_release_tail(raw)
    cut = first_noise_index(cleaned)
    kept = cleaned if cut is None else cleaned[:cut]
    return spaced(kept)
|
||||
|
||||
|
||||
def clean_title(raw: str) -> str:
    """Reduce *raw* to a bare title.

    After ``trim_noise``, year, ``SxxEyy``, ``NxMM`` and season markers are
    each replaced by a space (in that order) and the result is normalised.
    Returns ``"Unknown"`` when nothing is left.
    """
    text = trim_noise(raw)
    for marker in (YEAR_RE, EPISODE_RE, ALT_EPISODE_RE, SEASON_RE):
        text = marker.sub(" ", text)
    title = spaced(text)
    return title if title else "Unknown"
|
||||
|
||||
|
||||
def clean_episode_title(raw: str) -> str:
    """Reduce *raw* to a bare episode title.

    Applies ``trim_noise``, blanks any year, and normalises the result.
    Returns ``"Episode"`` when nothing is left.
    """
    without_year = YEAR_RE.sub(" ", trim_noise(raw))
    name = spaced(without_year)
    return name if name else "Episode"
|
||||
|
||||
|
||||
def parent_candidate(path: Path) -> str:
    """Return the name of the directory to consider as a title source.

    This is the name of *path*'s parent directory, or of the directory above
    it when the parent is named "subs", "subtitles" or "sub" (compared
    case-insensitively).

    Returns an empty string when that directory has no usable name: an empty
    name (e.g. a bare filename or a file directly under the root) or one of
    the placeholders ".", ".." and "/".
    """
    parent = path.parent
    # Skip a subtitle folder so the directory above it is used instead.
    if parent.name.lower() in {"subs", "subtitles", "sub"}:
        parent = parent.parent
    name = parent.name
    # Path("../file").parent.name is "..", which carries no title information;
    # letting it through makes callers pick it over the file stem and end up
    # with an empty title.
    if not name or name in {".", "..", "/"}:
        return ""
    return name
|
||||
|
||||
|
||||
def movie_title_source(path: Path, stem: str) -> str:
    """Choose between the parent directory name and *stem* as the movie title source.

    Preference order:
      1. the parent directory name, if it contains a year;
      2. *stem*, if it contains a year;
      3. the parent directory name, if it is non-empty, contains no noise
         token and no ``SxxEyy`` marker;
      4. *stem*.
    """
    parent = parent_candidate(path)

    # Whichever of the two carries a year wins, parent first.
    for candidate in (parent, stem):
        if YEAR_RE.search(candidate):
            return candidate

    parent_is_clean = (
        bool(parent)
        and first_noise_index(parent) is None
        and EPISODE_RE.search(parent) is None
    )
    return parent if parent_is_clean else stem
|
||||
|
||||
|
||||
def parse_media(path: str) -> dict:
    """Parse a media file path into its naming components.

    The file stem is classified as, in order of precedence:
      * ``"episode"`` - it contains ``SxxEyy`` or, failing that, ``NxMM``;
      * ``"season"``  - it contains a season marker only;
      * ``"movie"``   - anything else.

    Returns a dict with the keys ``source``, ``title``, ``year`` (int or
    None), ``quality`` (``" - <tag>"`` or ``""``), ``type``, ``season``,
    ``episode``, ``multi_episode`` (``"-Enn"`` or ``""``), ``episode_title``
    and ``extension`` (lower-cased suffix).
    """
    p = Path(path)
    stem = p.stem
    parent_name = parent_candidate(p)

    # Quality and year come from the stem, falling back to the parent folder.
    quality_match = QUALITY_RE.search(stem) or QUALITY_RE.search(parent_name)
    year_match = YEAR_RE.search(stem) or YEAR_RE.search(parent_name)

    media_type = "movie"
    season = None
    episode = None
    multi_episode = ""
    episode_title = ""

    # SxxEyy takes precedence over the NxMM form.
    sxe_match = EPISODE_RE.search(stem)
    numbered = sxe_match or ALT_EPISODE_RE.search(stem)

    if numbered:
        # The optional last-episode number is group 3 for SxxEyyEzz and
        # group 4 for "1x02-1x03" (group 3 there is the repeated season).
        last_episode = numbered.group(3 if sxe_match else 4)
        media_type = "episode"
        season = int(numbered.group(1))
        episode = int(numbered.group(2))
        if last_episode:
            multi_episode = f"-E{int(last_episode):02d}"
        # Series title precedes the marker; episode title follows it.
        title = clean_title(stem[:numbered.start()])
        episode_title = clean_episode_title(stem[numbered.end():])
    else:
        season_match = SEASON_RE.search(stem)
        if season_match:
            media_type = "season"
            season = int(season_match.group(1))
            # If nothing precedes the season marker, fall back to the parent
            # folder name and finally to the whole stem.
            prefix = stem[:season_match.start()]
            title = clean_title(prefix or parent_name or stem)
        else:
            title = clean_title(movie_title_source(p, stem))

    year = int(year_match.group(1)) if year_match else None
    quality = f" - {quality_match.group(1).replace('.', ' ')}" if quality_match else ""

    return {
        "source": str(p),
        "title": title,
        "year": year,
        "quality": quality,
        "type": media_type,
        "season": season,
        "episode": episode,
        "multi_episode": multi_episode,
        # Only the episode branch ever assigns a non-empty episode title.
        "episode_title": episode_title,
        "extension": p.suffix.lower(),
    }
|
||||
Reference in New Issue
Block a user