site/encode.py

#!/usr/bin/env python3
import base64
import concurrent.futures
import multiprocessing
import os
import xml.etree.ElementTree as ET
from datetime import datetime
from subprocess import run
from urllib.parse import urlparse

from mutagen.flac import Picture
from mutagen.oggopus import OggOpus
from mutagen.oggvorbis import OggVorbis

import common


def _is_stale(outfile, sources):
    """Return True when *outfile* has to be (re)generated from *sources*.

    That is the case when any source has a newer modification time than the
    output, or when the output or an inspected source does not exist.
    """
    try:
        target_mtime = os.path.getmtime(outfile)
        return any(os.path.getmtime(source) > target_mtime for source in sources)
    except FileNotFoundError:
        return True


def _embed_poster(path, fmt, poster):
    """Store the image file *poster* as cover art in the Ogg file at *path*.

    *fmt* selects the mutagen reader: "oga" is opened as Ogg Vorbis, anything
    else as Ogg Opus.
    """
    audio = OggVorbis(path) if fmt == "oga" else OggOpus(path)

    picture = Picture()
    with open(poster, "rb") as f:
        picture.data = f.read()
    # NOTE(review): the picture type, MIME type and dimensions are hard-coded.
    # They presumably describe a 500x500 JPEG poster -- confirm against the
    # actual file. Type 17 is "A bright coloured fish" in the FLAC/ID3 picture
    # type table (3 would be "Cover (front)") -- confirm this is intended.
    picture.type = 17
    picture.desc = ""
    picture.mime = "image/jpeg"
    picture.width = 500
    picture.height = 500
    picture.depth = 24
    # The tag value is the base64 encoding of the serialized picture block.
    audio["metadata_block_picture"] = [
        base64.b64encode(picture.write()).decode("ascii")
    ]

    audio.save()


def encode_episode(podcast, episode, format):
    """Encode one episode's FLAC file into a single target format if needed.

    The output is regenerated only when it is missing or older than the FLAC
    source, the episode's markdown file, or the podcast poster. The result is
    built in a temporary sibling file and moved into place only after every
    step succeeded, so a failed run never leaves a broken output behind that
    a later run would mistake for an up-to-date one.

    Args:
        podcast: dict with the keys "title", "link" and "poster" (path of the
            poster image file).
        episode: dict with the keys "file_base", "title", "contributors",
            "number", "date" (a datetime) and "summary".
        format: ``(name, options)`` pair, where ``name`` is the output file
            extension and ``options`` is a list of additional ffmpeg
            arguments.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    fmt, options = format

    infile = common.path_to_episode(episode["file_base"], "flac")
    outfile = common.path_to_episode(episode["file_base"], fmt)
    content_file = common.path_to_episode(episode["file_base"], "md")

    if not _is_stale(outfile, [infile, content_file, podcast["poster"]]):
        print(f"[⏭️] {episode['file_base']}.{fmt}")
        return

    tags = {
        "TITLE": episode["title"],
        "ARTIST": ", ".join(episode["contributors"]),
        "ALBUM": podcast["title"],
        "TRACK": episode["number"],
        "GENRE": "podcast",
        "DATE": datetime.strftime(episode["date"], "%Y-%m-%d"),
        "URL": podcast["link"],
        "COMMENT": episode["summary"],
    }

    # Ogg outputs get their poster attached through mutagen after encoding;
    # every other format receives it as a second ffmpeg input.
    is_ogg = fmt in ["oga", "opus"]

    # Keep the real extension at the end of the temporary name so that ffmpeg
    # can still pick the output container from it.
    stem, extension = os.path.splitext(outfile)
    partial = f"{stem}.partial{extension}"

    command = ["ffmpeg", "-y", "-loglevel", "error"]
    command.extend(["-i", infile])
    if not is_ogg:
        command.extend(["-i", podcast["poster"]])
    # For AAC, the default codec choice (ffmpeg native) is not the best choice
    if fmt == "m4a":
        command.extend(["-c:a", "libfdk_aac"])
    command.extend(["-c:v", "copy"])
    command.extend(["-bitexact"])  # deterministic output
    command.extend(options)
    for key, value in tags.items():
        command.extend(["-metadata", f"{key}={value}"])
    command.append(partial)

    try:
        run(command, check=True)
        if is_ogg:
            _embed_poster(partial, fmt, podcast["poster"])
        os.replace(partial, outfile)
    finally:
        # After a successful replace the partial file is already gone; after
        # a failure this removes the incomplete output.
        try:
            os.remove(partial)
        except FileNotFoundError:
            pass

    print(f"[✔️] {episode['file_base']}.{fmt}")


# Script body: read the generated Opus RSS feed and encode every episode it
# lists into every format in common.FORMATS, running the jobs on a thread pool.
os.makedirs("static/episodes", exist_ok=True)

# Namespace prefixes (ElementTree "{uri}tag" notation) of the feed extensions.
ITUNES_NS = "{http://www.itunes.com/dtds/podcast-1.0.dtd}"
ATOM_NS = "{http://www.w3.org/2005/Atom}"

tree = ET.parse("public/formats/opus/rss.xml")
root = tree.getroot()

channel = root.find("channel")

podcast = {
    "title": channel.find("title").text,
    "link": channel.find("link").text,
    # Map the poster URL's path component onto the local "static" directory.
    "poster": "static" + urlparse(channel.find("image").find("url").text).path,
}

# Maps each submitted future to a label used when reporting failures.
jobs = {}

# Leaving the "with" block waits for all submitted jobs to finish.
with concurrent.futures.ThreadPoolExecutor(
    max_workers=multiprocessing.cpu_count()
) as pool:
    for item in channel.findall("item"):
        episode = {
            "title": item.find("title").text,
            "number": item.find(f"{ITUNES_NS}episode").text,
            "date": datetime.strptime(
                item.find("pubDate").text, "%a, %d %b %Y %H:%M:%S %z"
            ),
            "contributors": [
                contributor.find(f"{ATOM_NS}name").text
                for contributor in item.findall(f"{ATOM_NS}contributor")
            ],
            "summary": item.find(f"{ITUNES_NS}summary").text,
            # Enclosure file name without its extension.
            "file_base": os.path.splitext(
                os.path.basename(item.find("enclosure").attrib["url"])
            )[0],
        }

        # Each entry is a (format name, ffmpeg options) pair.
        for fmt in common.FORMATS.items():
            job = pool.submit(encode_episode, podcast, episode, fmt)
            jobs[job] = f"{episode['file_base']}.{fmt[0]}"

# Exceptions raised inside worker threads are stored on the future and are
# otherwise lost; surface them so that a failed encode fails the whole run.
failed = 0
for job, label in jobs.items():
    error = job.exception()
    if error is not None:
        failed += 1
        print(f"[❌] {label}: {error}")

if failed:
    raise SystemExit(f"{failed} encode job(s) failed")
Initial commit 2020-08-11 16:14:38 +02:00			`#!/usr/bin/env python3`
			`import base64`
Encode different episodes simultaneously This results in less nice log output, but if it’s faster then who cares? 2021-12-01 15:48:29 +01:00			`import concurrent.futures`
			`import multiprocessing`
Initial commit 2020-08-11 16:14:38 +02:00			`import os`
			`import xml.etree.ElementTree as ET`
Reformat python scripts 2021-11-30 19:57:59 +01:00			`from datetime import datetime`
			`from subprocess import run`
			`from urllib.parse import urlparse`

			`from mutagen.flac import Picture`
			`from mutagen.oggopus import OggOpus`
			`from mutagen.oggvorbis import OggVorbis`
Initial commit 2020-08-11 16:14:38 +02:00
			`import common`


			`def encode_episode(podcast, episode, format):`
			`format, options = format`

Use flac instead of opus as source file format Using opus was a workaround when I still used drone for deployments. Since it now is deployed from my local machine, having FLACs in the repository is not that problematic. 2021-12-17 15:30:25 +01:00			`infile = common.path_to_episode(episode["file_base"], "flac")`
Initial commit 2020-08-11 16:14:38 +02:00			`outfile = common.path_to_episode(episode["file_base"], format)`
			`content_file = common.path_to_episode(episode["file_base"], "md")`

			`try:`
			`changed = any(`
			`os.path.getmtime(file) > os.path.getmtime(outfile)`
Remove per-episode poster feature We won’t use it. 2021-11-30 21:31:44 +01:00			`for file in [infile, content_file, podcast["poster"]]`
Initial commit 2020-08-11 16:14:38 +02:00			`)`
			`except FileNotFoundError:`
			`changed = True`

			`if changed:`
			`tags = {`
			`"TITLE": episode["title"],`
			`"ARTIST": ", ".join(episode["contributors"]),`
			`"ALBUM": podcast["title"],`
			`"TRACK": episode["number"],`
			`"GENRE": "podcast",`
			`"DATE": datetime.strftime(episode["date"], "%Y-%m-%d"),`
			`"URL": podcast["link"],`
			`"COMMENT": episode["summary"],`
			`}`

Build with nix This also switches from the hacky vendor.sh script to managing the dependencies with nix. 2021-11-30 19:17:20 +01:00			`command = ["ffmpeg", "-y", "-loglevel", "error"]`
Initial commit 2020-08-11 16:14:38 +02:00			`command.extend(["-i", infile])`
Reformat python scripts 2021-11-30 19:57:59 +01:00			`if format not in ["oga", "opus"]:`
Remove per-episode poster feature We won’t use it. 2021-11-30 21:31:44 +01:00			`command.extend(["-i", podcast["poster"]])`
encode.py: Force libfdk_aac For some reason, it is not the default. 2021-11-30 20:16:26 +01:00			`# For AAC, the default codec choice (ffmpeg native) is not the best choice`
			`if format == "m4a":`
			`command.extend(["-c:a", "libfdk_aac"])`
Initial commit 2020-08-11 16:14:38 +02:00			`command.extend(["-c:v", "copy"])`
Make encode bit-exact This ensures only files that were actually changed have to be uploaded on deploy. 2022-01-28 14:50:56 +01:00			`command.extend(["-bitexact"]) # deterministic output`
Initial commit 2020-08-11 16:14:38 +02:00			`command.extend(options)`
			`for k, v in tags.items():`
			`command.extend(["-metadata", f"{k}={v}"])`
			`command.append(outfile)`
Remove chapter feature We don’t have chapters 2021-11-30 19:39:46 +01:00			`run(command, check=True)`
Initial commit 2020-08-11 16:14:38 +02:00
			`if format in ["oga", "opus"]:`
			`if format == "oga":`
			`audio = OggVorbis(outfile)`
			`else:`
			`audio = OggOpus(outfile)`

			`# poster`
			`picture = Picture()`
Remove per-episode poster feature We won’t use it. 2021-11-30 21:31:44 +01:00			`with open(podcast["poster"], "rb") as f:`
Initial commit 2020-08-11 16:14:38 +02:00			`picture.data = f.read()`
			`picture.type = 17`
			`picture.desc = ""`
			`picture.mime = "image/jpeg"`
			`picture.width = 500`
			`picture.height = 500`
			`picture.depth = 24`
			`audio["metadata_block_picture"] = [`
			`base64.b64encode(picture.write()).decode("ascii")`
			`]`

			`audio.save()`

Encode different episodes simultaneously This results in less nice log output, but if it’s faster then who cares? 2021-12-01 15:48:29 +01:00			`print(f"[✔️] {episode['file_base']}.{format}")`
Initial commit 2020-08-11 16:14:38 +02:00			`else:`
Encode different episodes simultaneously This results in less nice log output, but if it’s faster then who cares? 2021-12-01 15:48:29 +01:00			`print(f"[⏭️] {episode['file_base']}.{format}")`
Initial commit 2020-08-11 16:14:38 +02:00

			`os.makedirs("static/episodes", exist_ok=True)`

			`tree = ET.parse("public/formats/opus/rss.xml")`
			`root = tree.getroot()`

			`channel = root.find("channel")`

			`podcast = {`
			`"title": channel.find("title").text,`
			`"link": channel.find("link").text,`
			`"poster": "static" + urlparse(channel.find("image").find("url").text).path,`
			`}`

Encode different episodes simultaneously This results in less nice log output, but if it’s faster then who cares? 2021-12-01 15:48:29 +01:00			`pool = concurrent.futures.ThreadPoolExecutor(max_workers=multiprocessing.cpu_count())`

Initial commit 2020-08-11 16:14:38 +02:00			`for item in channel.findall("item"):`
			`episode = {`
			`"title": item.find("title").text,`
			`"number": item.find("{http://www.itunes.com/dtds/podcast-1.0.dtd}episode").text,`
			`"date": datetime.strptime(`
			`item.find("pubDate").text, "%a, %d %b %Y %H:%M:%S %z"`
			`),`
			`"contributors": [`
			`contributor.find("{http://www.w3.org/2005/Atom}name").text`
			`for contributor in item.findall("{http://www.w3.org/2005/Atom}contributor")`
			`],`
			`"summary": item.find(`
			`"{http://www.itunes.com/dtds/podcast-1.0.dtd}summary"`
			`).text,`
			`"file_base": os.path.splitext(`
			`os.path.basename(item.find("enclosure").attrib["url"])`
			`)[0],`
			`}`

			`for format in common.FORMATS.items():`
Encode different episodes simultaneously This results in less nice log output, but if it’s faster then who cares? 2021-12-01 15:48:29 +01:00			`pool.submit(encode_episode, podcast, episode, format)`
Initial commit 2020-08-11 16:14:38 +02:00
Encode different episodes simultaneously This results in less nice log output, but if it’s faster then who cares? 2021-12-01 15:48:29 +01:00			`pool.shutdown(wait=True)`