site/metadata.py

69 lines
2 KiB
Python
Raw Normal View History

2020-08-11 16:14:38 +02:00
#!/usr/bin/env python3
import json
import os.path
2021-11-30 19:57:59 +01:00
from glob import glob
from html.parser import HTMLParser
2020-08-11 16:14:38 +02:00
import common
class EpisodeHtmlParser(HTMLParser):
    """Extract the embedded JSON config of one episode from an HTML page.

    The parser looks for the element
    ``<script type="application/json" id="config-episode-{episode}">`` and
    stores its decoded JSON content in ``data``. ``data`` stays an empty
    dict when no such element (or only an empty one) is found.

    Raises:
        json.JSONDecodeError: from ``feed()`` when the matching script
            element contains text that is not valid JSON.
    """

    # Class-level defaults are kept for backward compatibility; both
    # attributes are re-initialised per instance in ``__init__``.
    current_tag_is_episode_json = False
    data = {}

    def __init__(self, episode):
        """Create a parser that targets the script tag of *episode*."""
        super().__init__()
        self.episode = episode
        self.current_tag_is_episode_json = False
        self.data = {}
        # Text pieces of the matching script element, joined at its end tag.
        self._chunks = []

    def handle_starttag(self, tag, attrs):
        """Start collecting when the episode's JSON script tag opens."""
        attrs = dict(attrs)
        if (
            tag == "script"
            and attrs.get("type") == "application/json"
            # Use the episode this parser was built for, not whatever a
            # module-level variable named ``episode`` happens to hold.
            and attrs.get("id") == f"config-episode-{self.episode}"
        ):
            self.current_tag_is_episode_json = True
            self._chunks = []

    def handle_endtag(self, tag):
        """Decode the collected text once the matching element closes."""
        if not self.current_tag_is_episode_json:
            return
        self.current_tag_is_episode_json = False
        payload = "".join(self._chunks)
        self._chunks = []
        # An empty element leaves ``data`` untouched instead of failing.
        if payload.strip():
            self.data = json.loads(payload)

    def handle_data(self, data):
        """Buffer text of the matching element.

        HTMLParser may deliver the content of one element in several
        pieces (e.g. when the JSON contains ``</``), so the pieces are
        only parsed as a whole in ``handle_endtag``.
        """
        if self.current_tag_is_episode_json:
            self._chunks.append(data)
# Collect the duration and the per-format file sizes of every episode that
# has a markdown file under content/, then write them to static/episodes.json.
metadata = {}
for md_path in sorted(glob(f"content/{common.ACRONYM}*.md")):
    # The episode name is the markdown file name without its extension.
    episode = os.path.splitext(os.path.basename(md_path))[0]
    seconds = float(common.get_episode_info(episode, "original")["duration"])

    sizes = {}
    for fmt in common.FORMATS:
        try:
            size = os.path.getsize(common.path_to_episode(episode, fmt))
        except FileNotFoundError:
            # when bootstrapping for the first time the encoded files do
            # not exist
            size = 0
        sizes[fmt] = {"size": size}

    metadata[episode] = {
        "duration": common.sexagesimal(seconds),
        "formats": sizes,
    }

with open("static/episodes.json", "w") as out:
    out.write(json.dumps(metadata))
# Extract the podlove episode JSON embedded in each rendered episode page
# and store it as static/episodes/<episode>.podlove.json.
for page_path in sorted(glob(f"public/{common.ACRONYM}*/index.html")):
    # The episode name is the directory that contains index.html.
    episode = os.path.basename(os.path.dirname(page_path))

    parser = EpisodeHtmlParser(episode)
    with open(page_path) as page:
        html = page.read()
        parser.feed(html)
    metadata = parser.data

    os.makedirs("static/episodes", exist_ok=True)
    target_path = f"static/episodes/{episode}.podlove.json"
    with open(target_path, "w") as out:
        out.write(json.dumps(metadata))