# audio_wave.py
#
# This is free and unencumbered software released into the public domain.
# For more details, see the LICENSE file at
# https://github.com/adefossez/seewav/blob/master/LICENSE
# Original author: adefossez
"""
Generates a nice waveform visualization from an audio file and saves it as an mp4 file.
"""

import argparse
import json
import math
import subprocess as sp
import sys
import tempfile
from pathlib import Path

import cairo
import numpy as np
import PIL.Image as Image
import tqdm


def colorize(text, color):
    """
    Wrap `text` with an ANSI color code.
    See https://stackoverflow.com/questions/4842424/list-of-ansi-color-escape-sequences
    """
    code = f"\033[{color}m"
    restore = "\033[0m"
    return "".join([code, text, restore])
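
# e.g. colorize("error: ", 1) returns "\033[1merror: \033[0m", which most
# terminals render as bold text.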


def fatal(msg):
    """
    Something bad happened. Display an error message and abort.
    """
    head = "error: "
    if sys.stderr.isatty():
        head = colorize("error: ", 1)
    print(head + str(msg), file=sys.stderr)
    sys.exit(1)


def read_info(media):
    """
    Return some info on the media file.
    """
    proc = sp.run(
        ["ffprobe", "-loglevel", "panic", str(media), "-print_format", "json", "-show_format", "-show_streams"],
        capture_output=True,
    )
    if proc.returncode:
        raise OSError(f"{media} does not exist or is of a wrong type.")
    return json.loads(proc.stdout.decode("utf-8"))
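
# Usage sketch ("track.mp3" is a hypothetical file; requires ffprobe on PATH):
#
#   info = read_info("track.mp3")
#   stream = info["streams"][0]
#   print(stream["codec_type"], stream["channels"], stream["sample_rate"])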


def read_audio(audio, seek=None, duration=None):
    """
    Read the audio file, starting at `seek` (or 0) seconds for `duration`
    (or all) seconds. Returns `float[channels, samples]`.
    """
    info = read_info(audio)
    channels = None
    stream = info["streams"][0]
    if stream["codec_type"] != "audio":
        raise ValueError(f"{audio} should contain only audio.")
    channels = stream["channels"]
    samplerate = float(stream["sample_rate"])

    # Good old ffmpeg
    command = ["ffmpeg", "-y"]
    command += ["-loglevel", "panic"]
    if seek is not None:
        command += ["-ss", str(seek)]
    command += ["-i", audio]
    if duration is not None:
        command += ["-t", str(duration)]
    command += ["-f", "f32le"]
    command += ["-"]

    proc = sp.run(command, check=True, capture_output=True)
    wav = np.frombuffer(proc.stdout, dtype=np.float32)
    return wav.reshape(-1, channels).T, samplerate
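
# Usage sketch (hypothetical file name; requires ffmpeg on PATH):
#
#   wav, sr = read_audio("track.mp3", seek=10, duration=5)
#   print(wav.shape)  # (channels, samples), e.g. (2, 220500) at sr == 44100.0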


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def envelope(wav, window, stride):
    """
    Extract the envelope of the waveform `wav` (float[samples]), using average
    pooling with `window` samples and the given `stride`.
    """

    # pos = np.pad(np.maximum(wav, 0), window // 2)
    wav = np.pad(wav, window // 2)
    out = []
    for off in range(0, len(wav) - window, stride):
        frame = wav[off : off + window]
        out.append(np.maximum(frame, 0).mean())
    out = np.array(out)
    # Some form of audio compressor based on the sigmoid.
    out = 1.9 * (sigmoid(2.5 * out) - 0.5)
    return out
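
# Worked example (a sketch): one second of a 440 Hz tone sampled at 8 kHz,
# pooled with a 400-sample window and 100-sample stride, yields about
# len(wav) / stride == 80 compressed envelope values:
#
#   sr = 8000
#   t = np.arange(sr) / sr
#   env = envelope(np.sin(2 * np.pi * 440 * t), window=400, stride=100)
#   print(env.shape)  # (80,)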


def pil_to_surface(image):
    """
    Internal function, create a cairo surface from a Pillow image.
    """
    if "A" not in image.getbands():
        image.putalpha(255)  # fully opaque; 255 is the maximum 8-bit alpha value
    return cairo.ImageSurface.create_for_data(
        bytearray(image.tobytes("raw", "BGRa")), cairo.FORMAT_ARGB32, image.width, image.height
    )
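
# Usage sketch ("background.png" is a hypothetical file):
#
#   surface = pil_to_surface(Image.open("background.png"))
#   ctx = cairo.Context(surface)  # draw on top of the image with cairo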


def draw_env(envs, out, fg_colors, fg_opacity, bg_color, bg_image, center, size):
    """
    Internal function, draw a single frame (two frames for stereo) using cairo
    and save it to the `out` file as png. `envs` is a list of envelopes over
    channels; each env is a float[bars] representing the height of the envelope
    to draw. Each entry will be represented by a bar.
    """
    if bg_image is None:
        surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, *size)
        offset = [0, 0]
    else:
        surface = pil_to_surface(bg_image)
        # offset needs to be relative to the size of the surface,
        # not the size of the background image
        offset = [
            (bg_image.width * center[0] - size[0] / 2) / size[0],
            (bg_image.height * center[1] - size[1] / 2) / size[1],
        ]
    ctx = cairo.Context(surface)
    ctx.scale(*size)
    if bg_image is None:
        ctx.set_source_rgb(*bg_color)
        ctx.rectangle(0, 0, 1, 1)
        ctx.fill()

    K = len(envs)  # Number of waves to draw (waves are stacked vertically)
    T = len(envs[0])  # Number of time steps
    pad_ratio = 0.1  # spacing ratio between 2 bars
    width = 1.0 / (T * (1 + 2 * pad_ratio))
    pad = pad_ratio * width
    delta = 2 * pad + width

    ctx.translate(*offset)
    ctx.set_line_width(width)
    for step in range(T):
        for i in range(K):
            half = 0.5 * envs[i][step]  # (semi-)height of the bar
            half /= K  # as we stack K waves vertically
            midrule = (1 + 2 * i) / (2 * K)  # midrule of i-th wave
            ctx.set_source_rgba(*fg_colors[i], fg_opacity)
            ctx.move_to(pad + step * delta, midrule - half)
            ctx.line_to(pad + step * delta, midrule)
            ctx.stroke()
            ctx.set_source_rgba(*fg_colors[i], fg_opacity - fg_opacity / 5)
            ctx.move_to(pad + step * delta, midrule)
            ctx.line_to(pad + step * delta, midrule + 0.9 * half)
            ctx.stroke()

    surface.write_to_png(out)
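
# Usage sketch: draw one 400x300 frame with 50 bars of increasing height on a
# white background ("frame.png" is an arbitrary output name):
#
#   draw_env([np.linspace(0.1, 1.0, 50)], "frame.png", [(0.2, 0.2, 0.2)],
#            1.0, (1, 1, 1), None, (0.5, 0.5), (400, 300))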


def interpole(x1, y1, x2, y2, x):
    """
    Linearly interpolate between the points (x1, y1) and (x2, y2) at abscissa x.
    """
    return y1 + (y2 - y1) * (x - x1) / (x2 - x1)
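
# e.g. interpole(-6, 0.5, 0, 2, -3) == 1.25: halfway between x1 and x2, the
# result is halfway between y1 and y2.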


def visualize(
    audio,
    tmp,
    out,
    seek=None,
    duration=None,
    rate=60,
    bars=50,
    speed=4,
    time=0.4,
    oversample=3,
    fg_color=(0.2, 0.2, 0.2),
    fg_color2=(0.5, 0.3, 0.6),
    fg_opacity=1,
    bg_color=(1, 1, 1),
    bg_image=None,
    center=(0.5, 0.5),
    size=(400, 300),
    stereo=False,
):
    """
    Generate the visualisation for the audio file `audio`, using a `tmp` folder
    and saving the final video in `out`. `seek` and `duration` give the extract
    location if any. `rate` is the framerate of the output video.

    `bars` is the number of bars in the animation.
    `speed` is the base speed of transition. Depending on volume, the actual
        speed will vary between 0.5 and 2 times it.
    `time` is the amount of audio shown at once on a frame.
    `oversample`: higher values will lead to more frequent changes.
    `fg_color` is the rgb color to use for the foreground.
    `fg_color2` is the rgb color to use for the second wave if `stereo` is set.
    `fg_opacity` is the opacity of the foreground bars.
    `bg_color` is the rgb color to use for the background.
    `bg_image` is the path to a PNG image to use for the background.
    `center` is the center of the bars relative to the background image.
    `size` is the `(width, height)` in pixels to generate.
    `stereo` is whether to create 2 waves.
    """
    try:
        wav, sr = read_audio(audio, seek=seek, duration=duration)
    except (OSError, ValueError) as err:
        fatal(err)
        raise

    output_size = size
    image = None
    if bg_image is not None:
        try:
            image = Image.open(bg_image)
        except (OSError, ValueError) as err:
            fatal(err)
            raise

        # resize image to even dimensions, as required by ffmpeg's yuv420p output
        if image.width % 2 == 1:
            image = image.resize((image.width + 1, image.height))
        if image.height % 2 == 1:
            image = image.resize((image.width, image.height + 1))
        output_size = image.width, image.height

    # wavs is a list of wav over channels
    wavs = []
    if stereo:
        assert wav.shape[0] == 2, "stereo requires stereo audio file"
        wavs.append(wav[0])
        wavs.append(wav[1])
    else:
        wav = wav.mean(0)
        wavs.append(wav)

    for i, wav in enumerate(wavs):
        wavs[i] = wav / wav.std()

    window = int(sr * time / bars)
    stride = int(window / oversample)

    # envs is a list of env over channels
    envs = []
    for wav in wavs:
        env = envelope(wav, window, stride)
        env = np.pad(env, (bars // 2, 2 * bars))
        envs.append(env)

    duration = len(wavs[0]) / sr  # actual duration of the audio extract, in seconds
    frames = int(rate * duration)
    smooth = np.hanning(bars)

    print("Generating the frames...")
    for idx in tqdm.tqdm(range(frames), unit=" frames", ncols=80):
        pos = ((idx / rate) * sr) / stride / bars
        off = int(pos)
        loc = pos - off
        denvs = []
        for env in envs:
            env1 = env[off * bars : (off + 1) * bars]
            env2 = env[(off + 1) * bars : (off + 2) * bars]
            # we want loud parts to be updated faster
            maxvol = math.log10(1e-4 + env2.max()) * 10
            speedup = np.clip(interpole(-6, 0.5, 0, 2, maxvol), 0.5, 2)
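            # e.g. env2.max() == 0.25 gives maxvol ≈ -6.0 and speedup ≈ 0.5 (quiet
            # parts transition slowly); env2.max() == 1.0 gives maxvol ≈ 0.0 and
            # speedup ≈ 2 (loud parts transition quickly).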
            w = sigmoid(speed * speedup * (loc - 0.5))
            denv = (1 - w) * env1 + w * env2
            denv *= smooth
            denvs.append(denv)
        draw_env(denvs, tmp / f"{idx:06d}.png", (fg_color, fg_color2), fg_opacity, bg_color, image, center, size)

    audio_cmd = []
    if seek is not None:
        audio_cmd += ["-ss", str(seek)]
    audio_cmd += ["-i", audio.resolve()]
    if duration is not None:
        audio_cmd += ["-t", str(duration)]
    print("Encoding the animation video... ")
    # https://hamelot.io/visualization/using-ffmpeg-to-convert-a-set-of-images-into-a-video/
    sp.run(
        [
            "ffmpeg",
            "-y",
            "-loglevel",
            "panic",
            "-r",
            str(rate),
            "-f",
            "image2",
            "-s",
            f"{output_size[0]}x{output_size[1]}",
            "-i",
            "%06d.png",
        ]
        + audio_cmd
        + ["-c:a", "aac", "-vcodec", "libx264", "-crf", "10", "-pix_fmt", "yuv420p", "-shortest", out.resolve()],
        check=True,
        cwd=tmp,
    )
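
# Usage sketch (hypothetical paths; requires ffmpeg/ffprobe plus the cairo,
# numpy, Pillow and tqdm packages):
#
#   with tempfile.TemporaryDirectory() as tmp:
#       visualize(Path("track.mp3"), Path(tmp), Path("out.mp4"),
#                 rate=30, bars=60, stereo=False)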


def parse_color(colorstr):
    """
    Given a comma separated rgb color (e.g. "0.5,0.3,0.6"), returns a 3-tuple
    of floats.
    """
    try:
        r, g, b = (float(i) for i in colorstr.split(","))
        return r, g, b
    except ValueError:
        fatal("Format for color is 3 floats separated by commas 0.xx,0.xx,0.xx, rgb order")
        raise
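
# e.g. parse_color("0.03,0.6,0.3") == (0.03, 0.6, 0.3)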


def parse_coords(coordsstr):
    """
    Given comma separated x and y coordinates, returns a tuple of floats.
    """
    try:
        x, y = (float(i) for i in coordsstr.split(","))
        return x, y
    except ValueError:
        fatal("Format for coords is 2 floats separated by commas 0.x,0.y, xy order")
        raise
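
# e.g. parse_coords("0.5,0.5") == (0.5, 0.5)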


def main():
    parser = argparse.ArgumentParser("seewav", description="Generate a nice mp4 animation from an audio file.")
    parser.add_argument("-r", "--rate", type=int, default=60, help="Video framerate.")
    parser.add_argument("--stereo", action="store_true", help="Create 2 waveforms for stereo files.")
    parser.add_argument(
        "-c",
        "--color",
        default=[0.03, 0.6, 0.3],
        type=parse_color,
        dest="color",
        help="Color of the bars as `r,g,b` in [0, 1].",
    )
    parser.add_argument(
        "-c2",
        "--color2",
        default=[0.5, 0.3, 0.6],
        type=parse_color,
        dest="color2",
        help="Color of the second waveform as `r,g,b` in [0, 1] (for stereo).",
    )
    parser.add_argument("-o", "--opacity", type=float, default=1, help="The opacity of the waveform on the background.")
    parser.add_argument(
        "-b",
        "--background",
        default=[0, 0, 0],
        type=parse_color,
        dest="background",
        help="Set the background color as `r,g,b` in [0, 1]. Default is black (0,0,0).",
    )
    parser.add_argument("--white", action="store_true", help="Use white background. Default is black.")
    parser.add_argument("-i", "--image", dest="image", help="Set the background image.")
    parser.add_argument("-B", "--bars", type=int, default=50, help="Number of bars on the video at once")
    parser.add_argument("-O", "--oversample", type=float, default=4, help="Lower values will feel less reactive.")
    parser.add_argument("-T", "--time", type=float, default=0.4, help="Amount of audio shown at once on a frame.")
    parser.add_argument(
        "-S", "--speed", type=float, default=4, help="Higher values means faster transitions between frames."
    )
    parser.add_argument("-W", "--width", type=int, default=480, help="Width in pixels of the animation.")
    parser.add_argument("-H", "--height", type=int, default=300, help="Height in pixels of the animation.")
    parser.add_argument(
        "-C",
        "--center",
        default=[0.5, 0.5],
        type=parse_coords,
        dest="center",
        help="The center of the bars relative to the image.",
    )
    parser.add_argument("-s", "--seek", type=float, help="Seek to time in seconds in video.")
    parser.add_argument("-d", "--duration", type=float, help="Duration in seconds from seek time.")
    parser.add_argument("audio", type=Path, help="Path to audio file")
    parser.add_argument(
        "out", type=Path, nargs="?", default=Path("out.mp4"), help="Path to output file. Default is ./out.mp4"
    )
    args = parser.parse_args()
    with tempfile.TemporaryDirectory() as tmp:
        visualize(
            args.audio,
            Path(tmp),
            args.out,
            seek=args.seek,
            duration=args.duration,
            rate=args.rate,
            bars=args.bars,
            speed=args.speed,
            oversample=args.oversample,
            time=args.time,
            fg_color=args.color,
            fg_color2=args.color2,
            fg_opacity=args.opacity,
            bg_color=[1.0] * 3 if bool(args.white) else args.background,
            bg_image=args.image,
            center=args.center,
            size=(args.width, args.height),
            stereo=args.stereo,
        )


if __name__ == "__main__":
    main()
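
# Example invocation (assumes ffmpeg/ffprobe and the Python dependencies are
# installed; file names are hypothetical):
#
#   python audio_wave.py -r 30 -B 60 --stereo track.mp3 out.mp4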