# site/metadata.py

#!/usr/bin/env python3
import json
import os.path
from glob import glob
from html.parser import HTMLParser

import common


class EpisodeHtmlParser(HTMLParser):
    """Extract the JSON configuration embedded for one episode in an HTML page.

    The parser looks for a ``<script type="application/json"
    id="config-episode-{episode}">`` element and decodes its text content
    with ``json.loads``. After feeding a document, the decoded value is
    available as ``data``; it remains an empty dict when no matching,
    non-empty element was seen.

    Args:
        episode: Episode identifier used to build the expected ``id``
            attribute of the script element.

    Raises:
        json.JSONDecodeError: From ``feed`` when the matching script element
            contains text that is not valid JSON.
    """

    def __init__(self, episode):
        super().__init__()
        self.episode = episode
        # Per-instance state, so parsers never share a flag or result dict.
        self.current_tag_is_episode_json = False
        self.data = {}
        # Text chunks collected while inside the matching script element.
        self._chunks = []

    def handle_starttag(self, tag, attrs):
        """Start collecting text when the episode's JSON script tag opens."""
        attrs = dict(attrs)
        if (
            tag == "script"
            and attrs.get("type") == "application/json"
            # Compare against this parser's own episode, not a global name.
            and attrs.get("id") == f"config-episode-{self.episode}"
        ):
            self.current_tag_is_episode_json = True
            self._chunks = []

    def handle_endtag(self, tag):
        """Stop collecting and decode the buffered JSON text, if any."""
        if not self.current_tag_is_episode_json:
            return
        self.current_tag_is_episode_json = False
        if self._chunks:
            # The element's text may have arrived in several pieces; decode
            # only once the whole content is available.
            self.data = json.loads("".join(self._chunks))
            self._chunks = []

    def handle_data(self, data):
        """Buffer text that belongs to the episode's JSON script element."""
        if self.current_tag_is_episode_json:
            self._chunks.append(data)


# Index of per-episode metadata, keyed by episode name (the markdown file's
# base name without its extension).
metadata = {}

for file in sorted(glob(f"content/{common.ACRONYM}*.md")):
    basename = os.path.basename(file)
    episode = os.path.splitext(basename)[0]

    # The duration is taken from the "flac" episode info and rendered through
    # common.sexagesimal.
    flac_info = common.get_episode_info(episode, "flac")
    entry = {
        "duration": common.sexagesimal(float(flac_info["duration"])),
        "formats": {},
    }

    for fmt in common.FORMATS.keys():
        try:
            size = os.path.getsize(common.path_to_episode(episode, fmt))
        except FileNotFoundError:
            # when bootstrapping for the first time the encoded files do not exist
            size = 0
        entry["formats"][fmt] = {"size": size}

    metadata[episode] = entry

# Persist the complete index as a single JSON document.
with open("static/episodes.json", "w") as out:
    json.dump(metadata, out)

# Extract the podlove episode JSON embedded in each rendered episode page and
# write it to static/episodes/<episode>.podlove.json.
for file in sorted(glob(f"public/{common.ACRONYM}*/index.html")):
    # The episode name is the directory that contains index.html.
    episode = os.path.basename(os.path.dirname(file))
    parser = EpisodeHtmlParser(episode)
    # Read with an explicit encoding instead of the locale-dependent default,
    # so non-ASCII page content decodes the same on every machine.
    # NOTE(review): assumes the generated pages are UTF-8 — confirm.
    with open(file, encoding="utf-8") as f:
        parser.feed(f.read())
    metadata = parser.data

    os.makedirs("static/episodes", exist_ok=True)
    with open(f"static/episodes/{episode}.podlove.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(metadata))