r"""
Grab links from HN Post and generate Markdown post with image thumbnails
It also creates a Hugo blog post from Markdown and images generated
SUPPORT: To regenerate thumbnail, just delete the image file under thumbnails folder inside the post directory.
SUPPORT: To remove any link from the blog post, delete the entry after the post is created **in the blog directory**
Note down all the links somewhere then run the following command from blog directory to delete them
E.g. Image links will be like


$ pbpaste | awk -F\/ '{print $6}' | tr -d ')' | while read img; do find . -name $img -delete; done # noqa: W605
Usage:
$ python hn-links.py -l https://news.ycombinator.com/item?id=25381191 -b <blog_directory> --open-in-editor
"""
import logging
import os
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from datetime import datetime
from pathlib import Path
from urllib.parse import parse_qs, urlparse
from bs4 import BeautifulSoup
from py_executable_checklist.workflow import WorkflowBase, run_command, run_workflow
from common_utils import fetch_html_page, html_parser_from
UTF_ENCODING = "utf-8"Common functions
def fetch_html(url, post_html_page_file):
    logging.info(f"Fetching HTML title for {url}")
    if post_html_page_file.exists():
        logging.info(f"🌕 Loading page from cache {post_html_page_file}")
        return post_html_page_file.read_text(encoding=UTF_ENCODING)
    page_html = fetch_html_page(url)
    logging.info(f"Caching page {post_html_page_file}")
    post_html_page_file.write_text(page_html, encoding=UTF_ENCODING)
    return page_htmldef relative_image_directory():
    now = datetime.now()
    year = now.strftime("%Y")
    month = now.strftime("%m")
    day = now.strftime("%d")
    return f"images/{year}/{month}/{day}"Workflow steps
Create output folder using Post id in the temporary folder
class CreateOutputFolder(WorkflowBase):    hn_link: str    def run(self, context):
        hn_post_id = parse_qs(urlparse(self.hn_link).query).get("id")[0]
        download_folder = f"{os.getcwd()}/.temp"
        target_folder = Path(download_folder) / hn_post_id
        child_links_folder = target_folder / "links"
        thumbnails_folder = target_folder / "thumbnails"
        for f in [target_folder, child_links_folder, thumbnails_folder]:
            f.mkdir(parents=True, exist_ok=True)output
        context["hn_post_id"] = hn_post_id
        context["target_folder"] = target_folder
        context["child_links_folder"] = child_links_folder
        context["thumbnails_folder"] = thumbnails_folderUse requests to download HTML using a browser user agent
class GrabPostHtml(WorkflowBase):    hn_link: str
    target_folder: str    def run(self, context):
        post_html_page_file = Path(self.target_folder) / "hn_post.html"
        page_html = fetch_html(self.hn_link, post_html_page_file)output
        context["page_html"] = page_htmlCreate BeautifulSoap parser from html
class ParsePostHtml(WorkflowBase):    page_html: str    def run(self, context):output
        context["bs"] = html_parser_from(self.page_html)Extract page title using BeautifulSoap HTML parser
class GrabPostTitle(WorkflowBase):    bs: BeautifulSoup    def run(self, context):output
        context["hn_post_title"] = self.bs.title.stringExtract all links
class ExtractAllLinksFromPost(WorkflowBase):    bs: BeautifulSoup    def run(self, context):
        all_links = {link.get("href") for link in self.bs.find_all("a", href=True)}output
        context["all_links"] = all_linksWrite all links to file so that the next script can read them
class WriteLinksToFile(WorkflowBase):    all_links: set    def run(self, context):
        links_file = Path(context["target_folder"]) / "links.txt"
        links_file.write_text("\n".join(self.all_links), encoding=UTF_ENCODING)
        context["links_file"] = links_fileCall other script to download thumbnails and generate Hugo post
class CallLinksToHugoScript(WorkflowBase):    links_file: Path
    hn_post_title: str
    blog_directory: str    def run(self, _):
        cmd = (
            f"./venv/bin/python3 links_to_hugo.py "
            f'--links-file "{self.links_file}" '
            f'--post-title "{self.hn_post_title}" '
            f'--blog-directory "{self.blog_directory}"  '
            f"--open-in-editor"
        )
        run_command(cmd)Workflow definition
def workflow_steps():
    return [
        CreateOutputFolder,
        GrabPostHtml,
        ParsePostHtml,
        GrabPostTitle,
        ExtractAllLinksFromPost,
        WriteLinksToFile,
        CallLinksToHugoScript,
    ]Boilerplate
def setup_logging():
    logging.basicConfig(
        handlers=[logging.StreamHandler()],
        format="%(asctime)s - %(filename)s:%(lineno)d - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
    )
    logging.captureWarnings(capture=True)def main(args):
    context = args.__dict__
    run_workflow(context, workflow_steps())def parse_args():
    parser = ArgumentParser(description=__doc__, formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-l", "--hn-link", required=True, type=str, help="Link to HN Post")
    parser.add_argument(
        "-b",
        "--blog-directory",
        type=str,
        required=True,
        help="Full path to blog directory",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display context variables at each step",
    )
    return parser.parse_args()
if __name__ == "__main__":
    setup_logging()
    args = parse_args()
    main(args)