txt_to_audio_polly.py

#

Convert text to audio using AWS Polly

Usage: ./txt_to_audio_polly.py -i input.txt

It is also possible to use the AWS_PROFILE environment variable to specify the AWS profile to use. Otherwise, you can use the -p/--profile option to specify the profile: ./txt_to_audio_polly.py -i input.txt -p my_profile

import logging
import os
import subprocess
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from pathlib import Path

import boto3
#
def setup_logging(verbosity):
    """Configure root logging according to the ``-v`` count.

    Args:
        verbosity: Number of times ``-v`` was given. ``1`` selects INFO,
            ``2`` or more selects DEBUG, anything else leaves WARNING.

    Log records go to a stream handler, and warnings issued through the
    ``warnings`` module are redirected into logging as well.
    """
    if verbosity >= 2:
        level = logging.DEBUG
    elif verbosity == 1:
        level = logging.INFO
    else:
        level = logging.WARNING

    stream_handler = logging.StreamHandler()
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(filename)s:%(lineno)d - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=[stream_handler],
    )
    # Route warnings.warn() output through the logging system too.
    logging.captureWarnings(True)
#
def parse_args():
    """Parse the command-line arguments of the script.

    Returns:
        The ``argparse`` namespace with:
            input: ``Path`` of the text file to convert (required).
            profile: AWS profile name; defaults to the ``AWS_PROFILE``
                environment variable, or ``"default"`` when it is unset.
            verbose: how many times ``-v`` was passed (0 when absent).
    """
    # Resolved once here so the option default reflects the current environment.
    fallback_profile = os.getenv("AWS_PROFILE", "default")

    cli = ArgumentParser(
        formatter_class=RawDescriptionHelpFormatter,
        description=__doc__,
    )
    cli.add_argument("-i", "--input", required=True, type=Path, help="Input file")
    cli.add_argument(
        "-p",
        "--profile",
        default=fallback_profile,
        type=str,
        required=False,
        help="AWS Profile to use. If not provided then it'll use the AWS_PROFILE environment variable",
    )
    cli.add_argument(
        "-v",
        "--verbose",
        dest="verbose",
        action="count",
        default=0,
        help="Increase verbosity of logging output",
    )
    return cli.parse_args()
#
def yield_text_by_paragraphs(text):
    """Yield the pieces of *text* produced by ``str.splitlines()``.

    Each yielded item is one line without its line terminator; blank lines
    are yielded as empty strings.
    """
    lines = text.splitlines()
    yield from lines
#
def yield_text_by_fullstops(text, max_chars=1500):
    """Split *text* at full stops into chunks of at most *max_chars* characters.

    Sentences are accumulated until adding the next one would push the chunk
    past *max_chars*; the chunk is then yielded and a new one is started.

    Args:
        text: The text to split.
        max_chars: Upper bound on the chunk length, counting the full stops.
            A single sentence longer than this is not split further and is
            yielded as its own, oversized chunk.

    Yields:
        ``(chunk, counter)`` tuples, where ``counter`` starts at 1 and
        increases by one for every yielded chunk. Chunks that are empty or
        contain only whitespace are never yielded, so empty input yields
        nothing.
    """
    counter = 1
    para = ""
    pieces = text.split(".")
    last_index = len(pieces) - 1
    for index, piece in enumerate(pieces):
        # split() consumed one "." after every piece except the last one, so
        # only those get it back; otherwise every text would gain a spurious
        # trailing full stop.
        sentence = piece if index == last_index else piece + "."
        if not sentence:
            continue
        if len(para) + len(sentence) > max_chars:
            # Never emit a blank chunk (e.g. when the very first sentence is
            # already longer than max_chars).
            if para.strip():
                yield para, counter
                counter += 1
            para = ""
        para += sentence
    if para.strip():
        yield para, counter
#

Combine all mp3 files using ffmpeg

def combine_files(output_directory, output_file_base):
    """Concatenate the numbered mp3 parts into one mp3 file using ffmpeg.

    Parts are the files named ``<output_file_base>-<number>.mp3`` inside
    *output_directory*. They are joined in numeric order (so part 10 comes
    after part 2) into ``<output_file_base>.mp3`` in the same directory,
    overwriting it if it already exists.

    Every matching numbered part present in the directory is included, so
    parts left over from an earlier run are concatenated as well.

    Args:
        output_directory: ``Path`` of the directory holding the parts.
        output_file_base: File name stem shared by the parts and the result.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    prefix = f"{output_file_base}-"
    numbered_parts = []
    # Match by name instead of glob so that glob metacharacters in the base
    # name are taken literally and files such as "<base>-notes.mp3" are skipped.
    for candidate in output_directory.iterdir():
        if candidate.suffix != ".mp3" or not candidate.name.startswith(prefix):
            continue
        index = candidate.stem[len(prefix):]
        if index.isdecimal():
            numbered_parts.append((int(index), candidate.as_posix()))

    if not numbered_parts:
        logging.warning(
            "No '%s<number>.mp3' parts found in %s; nothing to combine",
            prefix,
            output_directory,
        )
        return

    # Sort on the integer index: directory listing order is arbitrary and a
    # plain string sort would put "-10" before "-2".
    numbered_parts.sort()
    files = "|".join(path for _, path in numbered_parts)
    output_file = output_directory.joinpath(f"{output_file_base}.mp3").as_posix()

    # Argument list without a shell, so paths containing spaces, quotes or
    # shell metacharacters are passed to ffmpeg verbatim.
    cmd = ["ffmpeg", "-y", "-i", f"concat:{files}", "-c", "copy", output_file]
    subprocess.run(cmd, check=True)
#
def main(args):
    """Convert the input text file to speech with AWS Polly.

    The text is split into chunks with ``yield_text_by_fullstops``; each chunk
    is synthesized to ``<stem>-<counter>.mp3`` next to the input file, and the
    parts are then merged with ``combine_files``.

    Args:
        args: Parsed command-line namespace providing ``input`` (a ``Path``)
            and ``profile`` (the AWS profile name used for the session).
    """
    session = boto3.Session(profile_name=args.profile)
    polly = session.client("polly")

    input_file = args.input
    output_directory: Path = input_file.parent
    output_file_base = input_file.stem

    # Read everything up front with an explicit encoding: the result no longer
    # depends on the platform default, and the input file is not kept open
    # while the network calls run.
    text = input_file.read_text(encoding="utf-8")

    parts_written = 0
    for para, counter in yield_text_by_fullstops(text):
        # A blank chunk carries nothing to speak, so it is not sent to Polly.
        if not para.strip():
            continue
        part_path = output_directory.joinpath(f"{output_file_base}-{counter}.mp3")
        logging.info("Synthesizing part %d (%d characters) to %s", counter, len(para), part_path)
        response = polly.synthesize_speech(
            OutputFormat="mp3",
            Text=para,
            TextType="text",
            VoiceId="Matthew",
        )
        audio_stream = response["AudioStream"]
        try:
            part_path.write_bytes(audio_stream.read())
        finally:
            # Release the underlying HTTP connection even if the write fails.
            audio_stream.close()
        parts_written += 1

    if not parts_written:
        logging.warning("No text to synthesize in %s; nothing to combine", input_file)
        return
    combine_files(output_directory, output_file_base)


if __name__ == "__main__":
    # Script entry point: parse the CLI, set the log level from the -v count,
    # then run the conversion.
    cli_args = parse_args()
    setup_logging(cli_args.verbose)
    main(cli_args)